In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_decision_regions
from gensim.models import Word2Vec
In [2]:
!pip install numpy pandas matplotlib seaborn scikit-learn mlxtend gensim
Requirement already satisfied: numpy in d:\anaconda\lib\site-packages (1.23.5) Requirement already satisfied: pandas in d:\anaconda\lib\site-packages (2.2.3) Requirement already satisfied: matplotlib in d:\anaconda\lib\site-packages (3.9.2) Requirement already satisfied: seaborn in d:\anaconda\lib\site-packages (0.13.2) Requirement already satisfied: scikit-learn in d:\anaconda\lib\site-packages (1.5.2) Requirement already satisfied: mlxtend in d:\anaconda\lib\site-packages (0.23.4) Requirement already satisfied: gensim in d:\anaconda\lib\site-packages (4.3.3) Requirement already satisfied: python-dateutil>=2.8.2 in d:\anaconda\lib\site-packages (from pandas) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in d:\anaconda\lib\site-packages (from pandas) (2024.1) Requirement already satisfied: tzdata>=2022.7 in d:\anaconda\lib\site-packages (from pandas) (2023.3) Requirement already satisfied: contourpy>=1.0.1 in d:\anaconda\lib\site-packages (from matplotlib) (1.3.1) Requirement already satisfied: cycler>=0.10 in d:\anaconda\lib\site-packages (from matplotlib) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in d:\anaconda\lib\site-packages (from matplotlib) (4.51.0) Requirement already satisfied: kiwisolver>=1.3.1 in d:\anaconda\lib\site-packages (from matplotlib) (1.4.4) Requirement already satisfied: packaging>=20.0 in d:\anaconda\lib\site-packages (from matplotlib) (24.1) Requirement already satisfied: pillow>=8 in d:\anaconda\lib\site-packages (from matplotlib) (11.0.0) Requirement already satisfied: pyparsing>=2.3.1 in d:\anaconda\lib\site-packages (from matplotlib) (3.2.0) Requirement already satisfied: scipy>=1.6.0 in d:\anaconda\lib\site-packages (from scikit-learn) (1.13.1) Requirement already satisfied: joblib>=1.2.0 in d:\anaconda\lib\site-packages (from scikit-learn) (1.4.2) Requirement already satisfied: threadpoolctl>=3.1.0 in d:\anaconda\lib\site-packages (from scikit-learn) (3.5.0) Requirement already satisfied: smart-open>=1.8.1 in d:\anaconda\lib\site-packages (from gensim) (7.1.0) Requirement already satisfied: six>=1.5 in d:\anaconda\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0) Requirement already satisfied: wrapt in d:\anaconda\lib\site-packages (from smart-open>=1.8.1->gensim) (1.14.1)
WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name' WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name' WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name' WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name'
In [3]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
df = df.select_dtypes(include =[np.number]) # keep only numerical columns
# Remove rows and columns that would lead to df being singular
df = df.dropna(axis='columns')
df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
columnNames = list(df)
if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
columnNames = columnNames[:10]
df = df[columnNames]
ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='hist')
corrs = df.corr().values
for i, j in zip(*plt.np.triu_indices_from(ax, k = 1)):
ax[i, j].annotate('%.1f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
plt.suptitle('Таблица анализа данных, коэффициент корреляции')
plt.show()
In [4]:
def decision_boundary_plot(X, y, X_train, y_train, clf, feature_indexes, title=None):
feature1_name, feature2_name = X.columns[feature_indexes]
X_feature_columns = X.values[:, feature_indexes]
X_train_feature_columns = X_train[:, feature_indexes]
clf.fit(X_train_feature_columns, y_train)
plot_decision_regions(X=X_feature_columns, y=y.values, clf=clf)
plt.xlabel(feature1_name)
plt.ylabel(feature2_name)
plt.title(title)
In [5]:
# загружаем полный датасет
raw_table_data = pd.read_csv('https://raw.githubusercontent.com/TAUforPython/BioMedAI/main/test_datasets/test_data_ECG.csv', nrows=5000)
# raw_table_data = pd.read_csv()
raw_table_data.head(10)
Out[5]:
| subject_id | Count_subj | study_id | cart_id | Healthy_Status | eeg_time | eeg_date | report_0 | report_1 | report_2 | ... | filtering | rr_interval | p_onset | p_end | qrs_onset | qrs_end | t_end | p_axis | qrs_axis | t_axis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 19557662 | 27 | 40000017 | 6848296 | 0 | 8:44 AM | 27.06.2015 | Sinus rhythm | Possible right atrial abnormality | NaN | ... | 60 Hz notch Baseline filter | 659 | 40 | 128 | 170 | 258 | 518 | 81 | 77 | 79 |
| 1 | 18477137 | 93 | 40000029 | 6848296 | 0 | 9:54 AM | 27.06.2015 | Sinus rhythm | Possible right atrial abnormality | NaN | ... | 60 Hz notch Baseline filter | 722 | 40 | 124 | 162 | 246 | 504 | 77 | 75 | 70 |
| 2 | 16598616 | 3 | 40000035 | 6376932 | 1 | 9:07 AM | 28.06.2015 | Sinus tachycardia | NaN | Normal ECG except for rate | ... | 60 Hz notch Baseline filter | 600 | 40 | 130 | 162 | 244 | 474 | 79 | 72 | 77 |
| 3 | 16368287 | 7 | 40000079 | 6214760 | 1 | 5:14 PM | 15.07.2015 | Sinus rhythm | NaN | Normal ECG | ... | 60 Hz notch Baseline filter | 659 | 40 | 146 | 180 | 254 | 538 | 79 | 66 | 69 |
| 4 | 18370366 | 2 | 40000084 | 6632385 | 0 | 1:52 PM | 27.09.2015 | Sinus rhythm | NaN | NaN | ... | <not specified> | 659 | 368 | 29999 | 504 | 590 | 868 | 84 | 80 | 77 |
| 5 | 15606157 | 55 | 40000089 | 6632385 | 0 | 2:29 PM | 29.10.2013 | Sinus rhythm | NaN | NaN | ... | <not specified> | 822 | 365 | 29999 | 499 | 592 | 852 | 26 | 46 | 30 |
| 6 | 12576058 | 43 | 40000115 | 6852956 | 1 | 12:54 PM | 23.03.2016 | Sinus rhythm | NaN | Normal ECG | ... | 60 Hz notch Baseline filter | 952 | 40 | 146 | 198 | 282 | 598 | 24 | 80 | 20 |
| 7 | 14691089 | 1 | 40000143 | 6551957 | 0 | 10:01 AM | 10.12.2016 | Sinus rhythm | rSr'(V1) - probable normal variant | Low QRS voltages in precordial leads | ... | 60 Hz notch Baseline filter | 923 | 40 | 140 | 188 | 278 | 594 | 26 | 86 | 13 |
| 8 | 14144725 | 7 | 40000144 | 6924910 | 0 | 7:24 AM | 11.12.2011 | Sinus rhythm with PAC(s). | NaN | Borderline ECG | ... | 60 Hz notch Baseline filter | 952 | 40 | 180 | 196 | 294 | 610 | 59 | -17 | 3 |
| 9 | 16089780 | 2 | 40000152 | 6919786 | 0 | 12:35 PM | 13.12.2011 | Sinus rhythm | Extensive T wave changes may be due to myocard... | NaN | ... | 60 Hz notch Baseline filter | 1000 | 40 | 156 | 178 | 274 | 584 | 8 | -11 | 19 |
10 rows × 36 columns
In [6]:
# Предобрабатываем данные словестных репортов word2vec в числовой вектор
# 这段代码通过去除异常值、合并和清理文本报告数据,使用 Word2Vec 模型将文本转化为数值表示,并将其作为新的特征用于后续分析。
# Убираем выбросы数据过滤,过滤掉所有在 columns_to_filter 列中具有大于 2000 的值的行,
# 筛选出 p_onset 小于 p_end 且 qrs_onset 小于 qrs_end 的行。这是为了去除数据中的异常值(如时间顺序错误等)
columns_to_filter = ['rr_interval', 'p_onset', 'p_end', 'qrs_onset', 'qrs_end', 't_end', 'p_axis', 'qrs_axis', 't_axis']
full_df_filtered = raw_table_data[(raw_table_data[columns_to_filter] < 2000).all(axis=1)]
full_df_filtered = full_df_filtered[(full_df_filtered['p_onset'] < full_df_filtered['p_end']) & (full_df_filtered['qrs_onset'] < full_df_filtered['qrs_end'])]
# Слепляем все текстовые отчеты в один большой文本数据预处理,
# 将 report_0 到 report_17 列中的文本数据合并成一个单一的文本列 report。每个报告通过空格连接。接着,去除字符串中的 nan 和多余的空格,确保文本格式清洁。
reports = [f'report_{x}' for x in range(18)]
full_df_filtered['report_0'] = full_df_filtered[reports].astype(str).agg(' '.join, axis=1)
full_df_filtered['report_0'] = full_df_filtered['report_0'].str.replace(r'\bnan\b', '', regex=True).str.replace(r'\s+', ' ', regex=True).str.strip()
full_df_filtered.rename(columns={'report_0': 'report'}, inplace=True)
reports_to_drop = [f'report_{x}' for x in range(1, 18)]
full_df_filtered = full_df_filtered.drop(reports_to_drop, axis=1)
# Фиксим имена столбцов 修正列名字;删除了 bandwidth 和 filtering 列(无关列)
full_df_filtered = full_df_filtered.rename(columns={'eeg_time ': 'eeg_time', 'eeg_date ': 'eeg_date'})
full_df_filtered = full_df_filtered.drop(columns = ['bandwidth', 'filtering'])
# Делаем колонку с таргетами крайней справа将 Healthy_Status 列移到数据框的最右侧
full_df_filtered = full_df_filtered[[col for col in full_df_filtered.columns if col != 'Healthy_Status'] + ['Healthy_Status']]
# Разбиваем столбец с текстовым отчетом на токены分词,将每个报告report的文本按空格进行分割,转换成词语列表
words = [text.split() for text in full_df_filtered['report']]
# Обучаем модель Word2Vec使用 Word2Vec 模型训练分词后的文本数据。这个模型会将每个词转换为一个向量。
w2v_model = Word2Vec(words)
# Функция для получения среднего эмбеддинга строки
# 定义了一个函数 get_sentence_embedding,用于计算每个报告的句子嵌入向量。首先将报告拆分成单词,然后从训练好的 Word2Vec 模型中提取每个单词的向量。
# 如果该单词有对应的向量,则返回所有词向量的均值;如果没有,则返回一个全零的向量。
# 之后,通过 .apply 方法将该函数应用到所有报告上,并将每个报告的嵌入向量的均值作为该报告的数值表示。
def get_sentence_embedding(sentence):
words = sentence.split()# 将输入的 sentence(句子)按空格拆分成一个个单词
word_vectors = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
# 对于句子中的每个单词,代码尝试从训练好的 Word2Vec 模型 w2v_model 中查找该单词的词向量
# 如果该单词在 Word2Vec 模型的词汇表中(word in w2v_model.wv),则提取对应的词向量;如果不在词汇表中,则跳过该单词。
if word_vectors:
return np.mean(word_vectors, axis=0) # Усреднённый вектор
#如果该句子中至少有一个词向量(即 word_vectors 列表非空),则通过 np.mean(word_vectors, axis=0) 计算所有单词词向量的均值。
# 这意味着句子的嵌入向量是句子中所有单词的向量的平均值。这样可以代表整个句子的含义。
else:
return np.zeros(w2v_model.vector_size) # Нулевой вектор, если слова нет в модели
# 如果句子中的单词都不在词汇表中,word_vectors 列表为空,则返回一个全零向量(np.zeros(w2v_model.vector_size)),即用一个默认的零向量表示这个句子。
# Применяем функцию ко всей колонке и переводим каждый вектор в одно число
# 查看预处理后的数据: 显示数据框的前几行,以检查预处理的结果。
full_df_filtered['report'] = full_df_filtered['report'].apply(lambda x: get_sentence_embedding(x).mean())
full_df_filtered.head()
Out[6]:
| subject_id | Count_subj | study_id | cart_id | eeg_time | eeg_date | report | rr_interval | p_onset | p_end | qrs_onset | qrs_end | t_end | p_axis | qrs_axis | t_axis | Healthy_Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 19557662 | 27 | 40000017 | 6848296 | 8:44 AM | 27.06.2015 | 0.012239 | 659 | 40 | 128 | 170 | 258 | 518 | 81 | 77 | 79 | 0 |
| 1 | 18477137 | 93 | 40000029 | 6848296 | 9:54 AM | 27.06.2015 | 0.012239 | 722 | 40 | 124 | 162 | 246 | 504 | 77 | 75 | 70 | 0 |
| 2 | 16598616 | 3 | 40000035 | 6376932 | 9:07 AM | 28.06.2015 | 0.035913 | 600 | 40 | 130 | 162 | 244 | 474 | 79 | 72 | 77 | 1 |
| 3 | 16368287 | 7 | 40000079 | 6214760 | 5:14 PM | 15.07.2015 | 0.022903 | 659 | 40 | 146 | 180 | 254 | 538 | 79 | 66 | 69 | 1 |
| 6 | 12576058 | 43 | 40000115 | 6852956 | 12:54 PM | 23.03.2016 | 0.022903 | 952 | 40 | 146 | 198 | 282 | 598 | 24 | 80 | 20 | 1 |
In [7]:
# make new dataset from old columns
table_data = full_df_filtered[['report','rr_interval','p_end','qrs_onset','qrs_end','t_end','p_axis','qrs_axis','t_axis','Healthy_Status']].copy()
table_data.head()
Out[7]:
| report | rr_interval | p_end | qrs_onset | qrs_end | t_end | p_axis | qrs_axis | t_axis | Healthy_Status | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.012239 | 659 | 128 | 170 | 258 | 518 | 81 | 77 | 79 | 0 |
| 1 | 0.012239 | 722 | 124 | 162 | 246 | 504 | 77 | 75 | 70 | 0 |
| 2 | 0.035913 | 600 | 130 | 162 | 244 | 474 | 79 | 72 | 77 | 1 |
| 3 | 0.022903 | 659 | 146 | 180 | 254 | 538 | 79 | 66 | 69 | 1 |
| 6 | 0.022903 | 952 | 146 | 198 | 282 | 598 | 24 | 80 | 20 | 1 |
In [8]:
# 绘制数据框(table_data)中每一列的箱形图(boxplot),用于可视化每个特征的数据分布情况。
n = table_data.shape[1]
fig,ax = plt.subplots(1,n, figsize=(n*2,4), sharex=True)
for i in range(n):
plt.sca(ax[i])
col = table_data.columns[i]
sns.boxplot(y = table_data.iloc[:,i],data=table_data, medianprops={"color": "r", "linewidth": 2})
In [9]:
# удаляем аномалии из данных (убрать все значения более 5000)
#table_data = table_data.drop(table_data[table_data['qrs_axis'] > 5000].index)
for i, v in enumerate(table_data):
table_data = table_data.drop(table_data[table_data.iloc[:,i] > 10000].index)
In [10]:
#绘制新的箱型图
n = table_data.shape[1]
fig,ax = plt.subplots(1,n, figsize=(n*2,4), sharex=True)
for i in range(n):
plt.sca(ax[i]) #每个子图共享 X 轴,便于比较不同列的数据分布
col = table_data.columns[i] # 获取当前列的名称
sns.boxplot(y = table_data.iloc[:,i],data=table_data,
medianprops={"color": "r", "linewidth": 2})#绘制箱形图
plt.tight_layout# 调整子图的布局,避免子图之间的重叠,确保每个子图都能清晰地显示出来。这个函数会自动调整子图之间的间距,使得图形看起来更加整洁。
In [11]:
sns.countplot(data=table_data, x="Healthy_Status") #使用sns绘制条形图
plt.title("Distribution of Healthy Status")
plt.show()
In [12]:
#example view 绘制 table_data 数据框中的 rr_interval 列的数据
plt.plot(table_data['rr_interval'],'.')
Out[12]:
[<matplotlib.lines.Line2D at 0x2f90fd29b90>]
In [13]:
# 这段代码用于绘制一个热力图(heatmap),展示 table_data 数据框中各列之间的相关性。
sns.heatmap(table_data.corr(),annot=True,fmt="0.2f",cmap="coolwarm")#设置显示的数值格式为保留两位小数;指定热力图的配色方案
plt.show()
In [14]:
# 调用了之前讨论的 plotScatterMatrix 函数,并传递了 table_data 数据框以及绘图的尺寸参数
plotScatterMatrix(table_data, 7, 10)
# plotScatterMatrix 函数的功能:
# 选择数值型列:plotScatterMatrix 函数首先会选择数据框中所有数值型的列进行绘制。
# 去除缺失值和常量列:函数会去除包含缺失值的列,并保留那些至少有两个不同值的列。这样可以避免在绘制时出现奇异的结果。
# 绘制散点矩阵:接着,函数会使用 pandas.plotting.scatter_matrix 绘制散点矩阵图。每对变量之间都会有一个散点图,主对角线则是该变量的直方图。
# 计算并显示相关性:在每个非对角线的散点图上,函数会计算并显示每对变量之间的相关性系数,帮助你观察不同特征之间的线性关系。
In [15]:
fig,ax = plt.subplots(figsize=(15,5))# 创建一个包含子图(ax)的图形对象(fig)
#sns.lineplot(x='age',y='debtinc',data=table_data,ax=ax)
sns.lineplot(x='rr_interval',y='t_end',data=table_data,ax=ax) #h绘制折线图(两者关系)
Out[15]:
<Axes: xlabel='rr_interval', ylabel='t_end'>
In [16]:
# 使用 Seaborn 绘制了一组散点图矩阵(pairplot),其中的每一对变量之间的关系被展示为散点图。
# 它还使用 Healthy_Status 列作为类别来着色,帮助观察不同健康状态类别之间的分布差异。
#labels = pd.DataFrame(table_data['Healthy_Status'])
sns.pairplot(table_data, hue = "Healthy_Status", #Healthy_Status 列的不同值会用不同的颜色显示,帮助我们区分不同健康状态类别的数据点。
height=1.5, # 设置每个子图的高度为 1.5 英寸。这里设置较小的 height 是为了调整图形的紧凑性,适合小尺寸的数据集。如果数据集较大,可以增大这个值。
plot_kws=dict(alpha=0.3))# alpha=0.3 表示散点图中的数据点的透明度设置为 30%。
Out[16]:
<seaborn.axisgrid.PairGrid at 0x2f90fcbe1d0>
In [17]:
# 使用 Seaborn 绘制了一个 散点图(stripplot),该图展示了 rr_interval 和 qrs_end 这两个变量之间的关系,并且使用 Healthy_Status 列来给数据点着色
sns.stripplot(x=table_data['rr_interval'],y=table_data['qrs_end'],
hue=table_data['Healthy_Status'],jitter=True)
Out[17]:
<Axes: xlabel='rr_interval', ylabel='qrs_end'>
In [18]:
# 使用 seaborn 库的 swarmplot 函数绘制了一个 蜂群图,展示了 rr_interval 与 Healthy_Status 之间的关系
sns.swarmplot(data=table_data, x="rr_interval", hue="Healthy_Status")
Out[18]:
<Axes: xlabel='rr_interval'>
D:\anaconda\Lib\site-packages\seaborn\categorical.py:3399: UserWarning: 58.1% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning)
In [20]:
# 使用 Seaborn 的 violinplot 函数绘制了 rr_interval 在不同健康状态下的分布情况。
# 小提琴图是一种综合了箱形图和密度图的图表,展示了数据的分布、集中趋势和变异性。
plt.figure(figsize=(14, 6))
sns.violinplot(data=table_data, y='rr_interval', x='Healthy_Status')
plt.title('rr_interval Distribution by Medical Condition')
plt.show()
In [21]:
# 以上为数据预处理部分+可视化部分
In [22]:
# 以下为PCA分析和t-SNE分析
In [23]:
# PCA主成分分析前的准备工作
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
In [24]:
#table_data_pca = full_df_filtered #将 full_df_filtered 数据框赋值给 table_data_pca。
table_data_pca = table_data
#full_df_filtered 是经过处理的原始数据,而 table_data_pca 是将用于后续PCA分析的目标数据框
table_data_pca = table_data_pca.drop('Healthy_Status', axis = 1)
# 删除数据框 table_data_pca 中的 Healthy_Status 列。
# 因为 PCA 是一种无监督的降维技术,通常仅用于数值特征数据,因此需要去除目标列(这里的 Healthy_Status 列)进行降维
table_data_pca.columns # 返回 table_data_pca 数据框中剩余的列名
Out[24]:
Index(['report', 'rr_interval', 'p_end', 'qrs_onset', 'qrs_end', 't_end',
'p_axis', 'qrs_axis', 't_axis'],
dtype='object')
In [25]:
#cancer = load_breast_cancer()
# Before applying PCA, each feature should be centered (zero mean) and with unit variance
scaled_data = StandardScaler().fit(table_data_pca).transform(table_data_pca)# 将数据按列进行标准化处理,使得每列的均值为 0,标准差为 1。
pca = PCA(n_components = 2).fit(scaled_data) # 创建了一个 PCA 对象,指定将数据降到 2 维;即希望通过 PCA 提取前两个主成分(即二维的降维)
# PCA(copy=True, n_components=2, whiten=False)
x_pca = pca.transform(scaled_data)
#将标准化后的数据 scaled_data 转换到主成分空间。通过这个步骤,数据会被投影到新的坐标系中,其中第一主成分和第二主成分是主要的轴
print(table_data.shape, x_pca.shape)
(3448, 10) (3448, 2)
In [26]:
percent = pca.explained_variance_ratio_# 这是 PCA 模型的一个属性,表示每个主成分对数据方差的贡献比例
print(percent)
print(sum(percent))
#To see how much variance is preserved for each dataset.
# 结果中第一个主成分解释了34%的方差,第二个主成分解释了13%的方差,两个主成分共同保留了48%的数据方差
[0.34425404 0.13828051] 0.48253454649422284
In [27]:
# 这段代码定义了一个名为 pca_explained 的函数,用于计算并显示每个主成分在主成分分析(PCA)中所解释的方差比例,直到累计方差解释比例超过指定的阈值。
def pca_explained(X, threshold):
features = X.shape[1]
for i in range(2, features):
pca = PCA(n_components = i).fit(X)
sum_ = pca.explained_variance_ratio_
# add all components explained variances
percent = sum(sum_)
print('{} components at {:.2f}% explained variance'.format(i,percent*100))
if percent > threshold:
break
pca_explained(scaled_data, 0.85)
# 结果含义:
# 使用前 2 个主成分时,能够解释约 48.12% 的方差。
# 使用前 3 个主成分时,能够解释约 61.17%的方差。
# 使用前 4 个主成分时,能够解释约 72.99% 的方差,
# 使用前 5 个主成分时,能够解释约 82.25% 的方差,
# 使用前 6 个主成分时,能够解释约 90.23% 的方差,超过阈值,停止计算
2 components at 48.25% explained variance 3 components at 61.27% explained variance 4 components at 73.08% explained variance 5 components at 82.27% explained variance 6 components at 90.26% explained variance
In [28]:
# 将通过 PCA 得到的前两个主成分绘制为散点图,并根据 Healthy_Status 列的数据进行着色,以显示健康状态类别之间的分布情况
plt.figure(figsize=(8,6)) # 创建一个图形,并设置图形的大小为 8 英寸宽,6 英寸高
plt.scatter(x_pca[:,0], x_pca[:,1], c=full_df_filtered['Healthy_Status'], cmap='plasma', alpha=0.4, edgecolors='black', s=65);
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
Out[28]:
Text(0, 0.5, 'Second Principal Component')
In [29]:
# 首先将 PCA 的成分值(即主成分的载荷)放入一个 DataFrame,并绘制了一个散点图,展示了降维后的数据。
# 在散点图的基础上,还展示了每个特征在主成分空间中的投影,利用箭头和标签指示每个特征如何在主成分上进行投影。
# put feature values into dataframe
components = pd.DataFrame(pca.components_.T, index=table_data_pca.columns, columns= ['PCA1','PCA2'])
# 绘制散点图
# plot size
plt.figure(figsize=(10,8))
# main scatterplot
plt.scatter(x_pca[:,0], x_pca[:,1], c=full_df_filtered['Healthy_Status'],
cmap='plasma', alpha=0.4, edgecolors='black', s=40);
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
#plt.ylim(15,-15);
#plt.xlim(20,-20);
# individual feature values添加第二个坐标轴
ax2 = plt.twinx().twiny();
#ax2.set_ylim(-0.5,0.5);
#ax2.set_xlim(-0.5,0.5);
# reference lines设置参考线;虚线用于在主成分空间中表示零点,帮助判断特征在主成分方向上的分布
ax2.hlines(0,-0.5,0.5, linestyles='dotted', colors='grey')
ax2.vlines(0,-0.5,0.5, linestyles='dotted', colors='grey')
# offset for labels绘制箭头和文本;箭头的方向和长度表示该特征在各主成分上的贡献
offset = 0.95
# arrow & text
for a, i in enumerate(components.index):
ax2.arrow(0, 0, components['PCA1'][a], -components['PCA2'][a], alpha=0.5, facecolor='white', head_width=.01)
ax2.annotate(i, (components['PCA1'][a]*offset, -components['PCA2'][a]*offset), color='orange')
C:\Users\25150\AppData\Local\Temp\ipykernel_9032\1802625492.py:30: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` ax2.arrow(0, 0, components['PCA1'][a], -components['PCA2'][a], alpha=0.5, facecolor='white', head_width=.01) C:\Users\25150\AppData\Local\Temp\ipykernel_9032\1802625492.py:31: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` ax2.annotate(i, (components['PCA1'][a]*offset, -components['PCA2'][a]*offset), color='orange')
In [30]:
# 可视化 PCA(主成分分析)中每个主成分的成分矩阵(载荷矩阵)。
# 使用 imshow 函数生成一个热图,以展示特征在主成分上的权重(即每个特征对主成分的贡献程度)
fig = plt.figure(figsize=(8, 4))#创建图形和设置大小
plt.imshow(pca.components_, interpolation = 'none', cmap = 'plasma')#绘制热图
feature_names = list(table_data_pca.columns)# 设置坐标轴
plt.gca().set_xticks(np.arange(-.5, len(feature_names)-1));
plt.gca().set_yticks(np.arange(0.5, 2));
plt.gca().set_xticklabels(feature_names, rotation=90, ha='left',fontsize=12);#设置刻度标签
plt.gca().set_yticklabels(['First PC', 'Second PC'], va='bottom',fontsize=12);
plt.colorbar(orientation='horizontal', ticks=[pca.components_.min(), 0,
pca.components_.max()],pad=0.65);#添加颜色条
In [31]:
# 以上为PCA主成分分析法
In [32]:
# 以下为t-SNE方法
In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
In [34]:
# 将目标列 'Healthy_Status' 排除,假设你已经准备好了数据集
#X = full_df_filtered.drop(columns=['Healthy_Status']) # 去除目标变量列
#y = full_df_filtered['Healthy_Status'] # 目标变量
X = table_data.drop(columns=['Healthy_Status']) # 去除目标变量列
y = table_data['Healthy_Status'] # 目标变量
# 对特征数据进行标准化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 使用t-SNE进行降维,将数据降到2维
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)
# 创建 DataFrame 存储 t-SNE 降维后的数据
tsne_df = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2'])
tsne_df['Healthy_Status'] = y
# 绘制 t-SNE 结果
plt.figure(figsize=(10, 8))
scatter = plt.scatter(tsne_df['TSNE1'], tsne_df['TSNE2'], c=tsne_df['Healthy_Status'], cmap='plasma', alpha=0.6, edgecolors='k', s=50)
# 添加颜色条
plt.colorbar(scatter)
plt.title('t-SNE visualization of the dataset')
plt.xlabel('TSNE1')
plt.ylabel('TSNE2')
plt.show()
In [35]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
# 假设 table_data 已经加载并进行了预处理
# 提取特征数据和目标变量
X = table_data.drop(columns=['Healthy_Status']) # 去掉目标变量 'Healthy_Status'
y = table_data['Healthy_Status'] # 提取目标变量 'Healthy_Status'
# 对特征数据进行标准化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 创建 t-SNE 实例,并将数据降维至 3 维
tsne = TSNE(n_components=3, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)
# 创建 DataFrame 存储 t-SNE 降维后的数据
tsne_df = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2', 'TSNE3']) # 三个主成分
tsne_df['Healthy_Status'] = y # 添加目标变量
# 绘制三维 t-SNE 结果
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d') # 创建三维坐标轴
# 绘制散点图
scatter = ax.scatter(tsne_df['TSNE1'], tsne_df['TSNE2'], tsne_df['TSNE3'], c=tsne_df['Healthy_Status'], cmap='plasma', alpha=0.6, edgecolors='k', s=50)
# 设置标题和标签
ax.set_title("3D t-SNE Visualization of the Dataset")
ax.set_xlabel("t-SNE Component 1")
ax.set_ylabel("t-SNE Component 2")
ax.set_zlabel("t-SNE Component 3")
# 添加颜色条
fig.colorbar(scatter)
# 显示图表
plt.show()
In [36]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
# 生成样本数据
#X, y = make_blobs(n_samples=300, centers=4, cluster_std=1.0, random_state=42)
X = table_data.drop(columns=['Healthy_Status']) # 去除目标变量列
y = table_data['Healthy_Status'] # 目标变量
# 可视化生成的数据
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, s=30, cmap='viridis')
plt.title("Original Dataset")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.show()
In [37]:
from sklearn.manifold import TSNE
# 创建 t-SNE 实例,并将数据降维至 2 维
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)
# 可视化降维后的数据
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, s=30, cmap='viridis')
plt.title("t-SNE Reduced Data")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.show()
In [38]:
#---------------------以下为lab3--------------------------
In [39]:
#-----ML: GussianClassifier + описание классов и матрица ошибок---------------
# 高斯分类器+类别描述+错误矩阵/混淆矩阵
In [40]:
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
In [41]:
table_data.columns
Out[41]:
Index(['report', 'rr_interval', 'p_end', 'qrs_onset', 'qrs_end', 't_end',
'p_axis', 'qrs_axis', 't_axis', 'Healthy_Status'],
dtype='object')
In [42]:
# set the classification index of table
clf_index = 9
# 设置分类目标列的索引,第9列是你要分类的类别列(target class)
In [43]:
#类别描述
plt.figure(figsize=(35,20),dpi=90)
# Plot frequency percentages barplot
table_data[table_data.columns[clf_index]].value_counts(normalize=True).mul(100).plot(kind='barh', width=0.8, figsize=(8,5))
# Add frequency percentages to the plot
labels = table_data[table_data.columns[clf_index]].value_counts(normalize=True).mul(100).round(1)
for i in labels.index:
plt.text(labels[i], i, str(labels[i])+ '%', fontsize=15, weight='bold')
plt.xlim([0, 110])
plt.xlabel('Frequency Percentage', fontsize=13)
plt.ylabel(table_data.columns[clf_index], fontsize=13)
plt.title('Frequency Percentage of Target Classes', fontsize=13)
plt.show()
In [44]:
#提取分类目标(y),提取输入特征(X),并把数据集划分为训练集和测试集(70%训练,30%测试)
#同时保持类别比例一致(stratify)。
# classification variable 从表格中取出第 clf_index 列,作为目标分类变量(y
y1 = table_data.iloc[:, clf_index]
# input variable 删除掉第 clf_index 列(目标变量那一列),剩下的全部作为输入特征(X)
X1 = table_data.drop(table_data.columns[[clf_index]], axis=1).iloc[:,:]
# 用 LabelEncoder 对目标变量 y1 进行编码(将字符串类别转换为数字,如:A→0,B→1)
# 并转成 pandas 的 Series 类型。
y1 = pd.Series(LabelEncoder().fit_transform(y1))
# 这行代码的作用是将数据随机分为训练集和测试集
X1_train, X1_test, y1_train, y1_test = train_test_split(X1.values,
y1.values,
test_size=0.3,
random_state=0,
stratify=y1.values)
In [45]:
# 各部分的形状信息(即样本数量和特征维度)
X_train,X_test,y_train,y_test = train_test_split(X1.values,
y1.values,
test_size=0.3,
random_state=41,
stratify=y1.values)
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
X_train shape: (2413, 9) y_train shape: (2413,) X_test shape: (1035, 9) y_test shape: (1035,)
In [46]:
#使用 GaussianNB(高斯朴素贝叶斯分类器)进行训练和预测,并评估分类效果
GussianClassifier = GaussianNB()
GussianClassifier.fit(X1_train,y1_train)
y_pred=GussianClassifier.predict(X_test)
print("===================================> Result <===================================")
print("Accuracy = " ,metrics.accuracy_score(y_test,y_pred))
print("F1 Score = " ,metrics.f1_score(y_test,y_pred))
===================================> Result <=================================== Accuracy = 0.927536231884058 F1 Score = 0.8803827751196173
In [47]:
# 通过混淆矩阵评估分类效果
from sklearn.metrics import ConfusionMatrixDisplay
In [48]:
# 分类模型评估流程
target_names = ['True', 'False']
labels_names = [0,1]
print(classification_report(y_test, y_pred,labels=labels_names, target_names=target_names))
# 生成分类报告
#cm = confusion_matrix(y_test, y_pred,labels=labels_names,normalize='true')
cm = confusion_matrix(y_test, y_pred,labels=labels_names)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=target_names)
disp = disp.plot(cmap=plt.cm.Blues,values_format='g')
plt.show()
# 绘制混淆矩阵
precision recall f1-score support
True 0.98 0.92 0.95 747
False 0.81 0.96 0.88 288
accuracy 0.93 1035
macro avg 0.90 0.94 0.91 1035
weighted avg 0.94 0.93 0.93 1035
In [49]:
cm = confusion_matrix(y_test,GussianClassifier.predict(X_test))
fig , ax = plt.subplots(figsize=(4,4))
ax.imshow(cm, cmap = 'plasma')
ax.grid(False)
ax.xaxis.set(ticks=(0,1),ticklabels=("Predicted as True","Predicted as False"))
ax.yaxis.set(ticks=(0,1),ticklabels=("Actual as True","Actual as False"))
ax.set_ylim(1.5,-0.5)
for i in range(2):
for j in range(2):
ax.text(j,i,cm[i,j],ha="center",va="center",color = "red")
In [50]:
# --------------举例 AutoML----------------
# Убираем из датасета лишние столбцы
# Формируем датасет для AutoML методов
autoML_df = full_df_filtered[['report', 'rr_interval', 'p_end', 'qrs_onset', 'qrs_end', 't_end', 'p_axis', 'qrs_axis', 't_axis', 'Healthy_Status']].copy()
In [51]:
# AutoML H2O 使用的标准启动流程
# 安装 H2O Python 包(需要联网且在支持 Jupyter / Colab / 本地环境下运行)
!pip install h2o
Requirement already satisfied: h2o in d:\anaconda\lib\site-packages (3.46.0.6) Requirement already satisfied: requests in d:\anaconda\lib\site-packages (from h2o) (2.32.3) Requirement already satisfied: tabulate in d:\anaconda\lib\site-packages (from h2o) (0.9.0) Requirement already satisfied: charset-normalizer<4,>=2 in d:\anaconda\lib\site-packages (from requests->h2o) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in d:\anaconda\lib\site-packages (from requests->h2o) (3.7) Requirement already satisfied: urllib3<3,>=1.21.1 in d:\anaconda\lib\site-packages (from requests->h2o) (2.2.3) Requirement already satisfied: certifi>=2017.4.17 in d:\anaconda\lib\site-packages (from requests->h2o) (2024.8.30)
WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name' WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name' WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name' WARNING: Skipping D:\anaconda\Lib\site-packages\scipy-1.14.1.dist-info due to invalid metadata entry 'name'
In [52]:
import h2o
from h2o.frame import H2OFrame
from h2o.automl import H2OAutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, roc_auc_score
In [54]:
# Инициализируем H2O
h2o.init()
Checking whether there is an H2O instance running at http://localhost:54321. connected. Warning: Your H2O cluster version is (4 months and 22 days) old. There may be a newer version available. Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
| H2O_cluster_uptime: | 43 secs |
| H2O_cluster_timezone: | Europe/Moscow |
| H2O_data_parsing_timezone: | UTC |
| H2O_cluster_version: | 3.46.0.6 |
| H2O_cluster_version_age: | 4 months and 22 days |
| H2O_cluster_name: | H2O_from_python_25150_atuyo6 |
| H2O_cluster_total_nodes: | 1 |
| H2O_cluster_free_memory: | 7.882 Gb |
| H2O_cluster_total_cores: | 32 |
| H2O_cluster_allowed_cores: | 32 |
| H2O_cluster_status: | locked, healthy |
| H2O_connection_url: | http://localhost:54321 |
| H2O_connection_proxy: | {"http": null, "https": null} |
| H2O_internal_security: | False |
| Python_version: | 3.11.5 final |
In [55]:
#数据格式转换、数据集划分、指定目标变量和特征列
# Переводим датафрейм в формат H2OFrame
# 把 Pandas DataFrame(比如 autoML_df)转换为 H2O 使用的专有数据格式 H2OFrame,这是 H2O 建模的输入格式
h2o_df = H2OFrame(autoML_df)
#h2o_df['Healthy_Status'] = h2o_df['Healthy_Status'].asfactor()#这样 H2O 就会知道这是一个二分类问题,而不是连续值回归。
# Разбиение на обучающую и тестовую выборки
# 把 H2O 数据集 h2o_df 按照 85% : 15% 的比例,随机划分为训练集(train)和测试集(test)
random_seed = 17
train, test = h2o_df.split_frame(ratios=[0.85], seed=random_seed)
# Определяем целевую переменную и предикторы
# 指定 H2O 自动建模中的“目标变量”和“输入特征”
y="Healthy_Status" # 你要预测的字段(即分类或回归目标)
x = list(h2o_df.columns) # 获取所有列名
x.remove(y) # 把目标变量 y 从特征列表中移除,剩下的就是输入变量(predictors)
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
In [56]:
# 自动训练多种机器学习模型并自动选择最佳模型,然后在测试集上评估性能
# Создаем и обучаем модель H2O AutoML
automl = H2OAutoML(max_runtime_secs=60, seed=random_seed, verbosity="info")
# 最多训练 60 秒(控制时间,而不是模型数量);最多训练 60 秒(控制时间,而不是模型数量);最多训练 60 秒(控制时间,而不是模型数量);
#automl= H2OAutoML(max_models = 10, seed = 10, exclude_algos = ["StackedEnsemble", "DeepLearning"], verbosity="info", nfolds=0)
automl.train(x=x, y=y, training_frame=train)
# Оценка модели на тестовом наборе
performance = automl.leader.model_performance(test_data=test)
print(performance)
AutoML progress: |█
00:50:08.271: Project: AutoML_1_20250325_05008
00:50:08.272: 5-fold cross-validation will be used.
00:50:08.272: Setting stopping tolerance adaptively based on the training frame: 0.018464772811525407
00:50:08.272: Build control seed: 17
00:50:08.273: training frame: Frame key: AutoML_1_20250325_05008_training_py_2_sid_94de cols: 10 rows: 2933 chunks: 1 size: 68508 checksum: 120712254201011546
00:50:08.273: validation frame: NULL
00:50:08.273: leaderboard frame: NULL
00:50:08.273: blending frame: NULL
00:50:08.273: response column: Healthy_Status
00:50:08.273: fold column: null
00:50:08.273: weights column: null
00:50:08.282: AutoML: XGBoost is not available; skipping it.
00:50:08.290: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (6g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid_1 (4g, 60w), lr_annealing (6g, 10w)]}, {DeepLearning : [def_1 (3g, 10w), grid_1 (4g, 30w), grid_2 (5g, 30w), grid_3 (5g, 30w)]}, {completion : [resume_best_grids (10g, 60w)]}, {StackedEnsemble : [best_of_family_1 (1g, 5w), best_of_family_2 (2g, 5w), best_of_family_3 (3g, 5w), best_of_family_4 (4g, 5w), best_of_family_5 (5g, 5w), all_2 (2g, 10w), all_3 (3g, 10w), all_4 (4g, 10w), all_5 (5g, 10w), monotonic (6g, 10w), best_of_family_gbm (6g, 10w), all_gbm (7g, 10w), best_of_family_xglm (8g, 10w), all_xglm (8g, 10w), best_of_family (10g, 10w), best_N (10g, 10w)]}]
00:50:08.307: Disabling Algo: XGBoost as requested by the user.
00:50:08.308: AutoML job created: 2025.03.25 00:50:08.251
00:50:08.308: AutoML build started: 2025.03.25 00:50:08.308
00:50:08.324: AutoML: starting GLM_1_AutoML_1_20250325_05008 model training
00:50:08.324: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:09.105: New leader: GLM_1_AutoML_1_20250325_05008, rmse: 0.2865832550349015
00:50:09.115: AutoML: starting GBM_1_AutoML_1_20250325_05008 model training
00:50:09.115: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:12.646: New leader: GBM_1_AutoML_1_20250325_05008, rmse: 0.12946059420121422
██
00:50:12.710: AutoML: starting StackedEnsemble_BestOfFamily_1_AutoML_1_20250325_05008 model training
00:50:12.710: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:12.906: New leader: StackedEnsemble_BestOfFamily_1_AutoML_1_20250325_05008, rmse: 0.12939553821337055
00:50:12.929: AutoML: starting DRF_1_AutoML_1_20250325_05008 model training
00:50:12.930: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:14.209: New leader: DRF_1_AutoML_1_20250325_05008, rmse: 0.1288446703068968
00:50:14.210: AutoML: starting GBM_2_AutoML_1_20250325_05008 model training
00:50:14.210: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:14.841: New leader: GBM_2_AutoML_1_20250325_05008, rmse: 0.1128652612873039
00:50:14.842: AutoML: starting GBM_3_AutoML_1_20250325_05008 model training
00:50:14.842: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:15.434: AutoML: starting GBM_4_AutoML_1_20250325_05008 model training
00:50:15.434: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:16.61: AutoML: starting StackedEnsemble_BestOfFamily_2_AutoML_1_20250325_05008 model training
00:50:16.61: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:16.203: AutoML: starting StackedEnsemble_AllModels_1_AutoML_1_20250325_05008 model training
00:50:16.204: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:16.378: New leader: StackedEnsemble_AllModels_1_AutoML_1_20250325_05008, rmse: 0.11231078419053396
00:50:16.378: AutoML: starting XRT_1_AutoML_1_20250325_05008 model training
00:50:16.378: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:17.284: AutoML: starting GBM_5_AutoML_1_20250325_05008 model training
00:50:17.285: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
███
00:50:18.165: New leader: GBM_5_AutoML_1_20250325_05008, rmse: 0.10833656809442252
00:50:18.171: AutoML: starting DeepLearning_1_AutoML_1_20250325_05008 model training
00:50:18.172: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:20.32: AutoML: starting StackedEnsemble_BestOfFamily_3_AutoML_1_20250325_05008 model training
00:50:20.33: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:20.199: AutoML: starting StackedEnsemble_AllModels_2_AutoML_1_20250325_05008 model training
00:50:20.199: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:50:20.355: New leader: StackedEnsemble_AllModels_2_AutoML_1_20250325_05008, rmse: 0.1082561546805813
00:50:20.359: AutoML: starting GBM_grid_1_AutoML_1_20250325_05008 hyperparameter search
███████████████████████████████
00:50:47.909: AutoML: starting DeepLearning_grid_1_AutoML_1_20250325_05008 hyperparameter search
████████████████
00:51:01.713: AutoML: starting StackedEnsemble_AllModels_3_AutoML_1_20250325_05008 model training
00:51:01.713: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:51:01.981: New leader: StackedEnsemble_AllModels_3_AutoML_1_20250325_05008, rmse: 0.10679729316052826
00:51:01.982: AutoML: starting DeepLearning_grid_2_AutoML_1_20250325_05008 hyperparameter search
█████
00:51:04.580: AutoML: starting DeepLearning_grid_3_AutoML_1_20250325_05008 hyperparameter search
00:51:07.172: AutoML: starting StackedEnsemble_BestOfFamily_4_AutoML_1_20250325_05008 model training
00:51:07.172: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
█████| (done) 100%
00:51:07.328: AutoML: starting StackedEnsemble_AllModels_4_AutoML_1_20250325_05008 model training
00:51:07.328: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:51:07.557: Retraining best GBM with learning rate annealing: GBM_5_AutoML_1_20250325_05008
00:51:07.557: AutoML: starting GBM_lr_annealing_selection_AutoML_1_20250325_05008_select_model model training
00:51:07.559: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:51:08.531: Actual modeling steps: [{GLM : [def_1 (1g, 10w)]}, {GBM : [def_5 (1g, 10w)]}, {StackedEnsemble : [best_of_family_1 (1g, 5w)]}, {DRF : [def_1 (2g, 10w)]}, {GBM : [def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w)]}, {StackedEnsemble : [best_of_family_2 (2g, 5w), all_2 (2g, 10w)]}, {DRF : [XRT (3g, 10w)]}, {GBM : [def_1 (3g, 10w)]}, {DeepLearning : [def_1 (3g, 10w)]}, {StackedEnsemble : [best_of_family_3 (3g, 5w), all_3 (3g, 10w)]}, {GBM : [grid_1 (4g, 60w)]}, {DeepLearning : [grid_1 (4g, 30w)]}, {StackedEnsemble : [all_4 (4g, 10w)]}, {DeepLearning : [grid_2 (5g, 30w), grid_3 (5g, 30w)]}, {StackedEnsemble : [best_of_family_5 (5g, 5w), all_5 (5g, 10w)]}, {GBM : [lr_annealing (6g, 10w)]}]
00:51:08.531: AutoML build stopped: 2025.03.25 00:51:08.531
00:51:08.531: AutoML build done: built 43 models
00:51:08.531: AutoML duration: 1 min 0.223 sec
ModelMetricsRegressionGLM: stackedensemble ** Reported on test data. ** MSE: 0.005003135627156996 RMSE: 0.0707328468758115 MAE: 0.03130546386615607 RMSLE: 0.050631346579269945 Mean Residual Deviance: 0.005003135627156996 R^2: 0.9727803764776879 Null degrees of freedom: 514 Residual degrees of freedom: 506 Null deviance: 95.56748634029051 Residual deviance: 2.576614847985853 AIC: -1246.8038862031594
In [57]:
# 用来查看 H2O AutoML 所训练的所有模型的排行榜(leaderboard)
lb = automl.leaderboard
lb.head() # 查看排行榜中最靠前的几条记录(默认前 10 个模型)
Out[57]:
| model_id | rmse | mse | mae | rmsle | mean_residual_deviance |
|---|---|---|---|---|---|
| StackedEnsemble_AllModels_3_AutoML_1_20250325_05008 | 0.106797 | 0.0114057 | 0.0390613 | 0.0757921 | 0.0114057 |
| StackedEnsemble_AllModels_4_AutoML_1_20250325_05008 | 0.107562 | 0.0115696 | 0.039639 | 0.076378 | 0.0115696 |
| StackedEnsemble_AllModels_2_AutoML_1_20250325_05008 | 0.108256 | 0.0117194 | 0.0365698 | 0.0768915 | 0.0117194 |
| GBM_5_AutoML_1_20250325_05008 | 0.108337 | 0.0117368 | 0.0380686 | 0.0774662 | 0.0117368 |
| StackedEnsemble_BestOfFamily_4_AutoML_1_20250325_05008 | 0.108482 | 0.0117684 | 0.0384124 | 0.0774874 | 0.0117684 |
| StackedEnsemble_BestOfFamily_3_AutoML_1_20250325_05008 | 0.108506 | 0.0117735 | 0.0381567 | 0.0775736 | 0.0117735 |
| GBM_grid_1_AutoML_1_20250325_05008_model_10 | 0.111103 | 0.0123439 | 0.0379281 | 0.0789915 | 0.0123439 |
| GBM_grid_1_AutoML_1_20250325_05008_model_16 | 0.112089 | 0.0125639 | 0.0324281 | 0.0792078 | 0.0125639 |
| StackedEnsemble_AllModels_1_AutoML_1_20250325_05008 | 0.112311 | 0.0126137 | 0.0389786 | 0.0799503 | 0.0126137 |
| GBM_grid_1_AutoML_1_20250325_05008_model_4 | 0.112733 | 0.0127087 | 0.0417476 | 0.080158 | 0.0127087 |
[10 rows x 6 columns]
In [58]:
#数据格式转换、数据集划分、指定目标变量和特征列
# Переводим датафрейм в формат H2OFrame
# 把 Pandas DataFrame(比如 autoML_df)转换为 H2O 使用的专有数据格式 H2OFrame,这是 H2O 建模的输入格式
h2o_df = H2OFrame(autoML_df)
h2o_df['Healthy_Status'] = h2o_df['Healthy_Status'].asfactor()#这样 H2O 就会知道这是一个二分类问题,而不是连续值回归。
# Разбиение на обучающую и тестовую выборки
# 把 H2O 数据集 h2o_df 按照 85% : 15% 的比例,随机划分为训练集(train)和测试集(test)
random_seed = 17
train, test = h2o_df.split_frame(ratios=[0.85], seed=random_seed)
# Определяем целевую переменную и предикторы
# 指定 H2O 自动建模中的“目标变量”和“输入特征”
y="Healthy_Status" # 你要预测的字段(即分类或回归目标)
x = list(h2o_df.columns) # 获取所有列名
x.remove(y) # 把目标变量 y 从特征列表中移除,剩下的就是输入变量(predictors)
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
In [59]:
# 自动训练多种机器学习模型并自动选择最佳模型,然后在测试集上评估性能
# Создаем и обучаем модель H2O AutoML
automl = H2OAutoML(max_runtime_secs=60, seed=random_seed, verbosity="info")
# 最多训练 60 秒(控制时间,而不是模型数量);最多训练 60 秒(控制时间,而不是模型数量);最多训练 60 秒(控制时间,而不是模型数量);
#automl= H2OAutoML(max_models = 10, seed = 10, exclude_algos = ["StackedEnsemble", "DeepLearning"], verbosity="info", nfolds=0)
automl.train(x=x, y=y, training_frame=train)
# Оценка модели на тестовом наборе
performance = automl.leader.model_performance(test_data=test)
print(performance)
AutoML progress: |█
00:52:58.706: Project: AutoML_2_20250325_05258
00:52:58.707: 5-fold cross-validation will be used.
00:52:58.707: Setting stopping tolerance adaptively based on the training frame: 0.018464772811525407
00:52:58.707: Build control seed: 17
00:52:58.707: training frame: Frame key: AutoML_2_20250325_05258_training_py_10_sid_94de cols: 10 rows: 2933 chunks: 1 size: 69056 checksum: 120712254201011546
00:52:58.707: validation frame: NULL
00:52:58.707: leaderboard frame: NULL
00:52:58.707: blending frame: NULL
00:52:58.707: response column: Healthy_Status
00:52:58.707: fold column: null
00:52:58.707: weights column: null
00:52:58.707: AutoML: XGBoost is not available; skipping it.
00:52:58.707: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (6g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid_1 (4g, 60w), lr_annealing (6g, 10w)]}, {DeepLearning : [def_1 (3g, 10w), grid_1 (4g, 30w), grid_2 (5g, 30w), grid_3 (5g, 30w)]}, {completion : [resume_best_grids (10g, 60w)]}, {StackedEnsemble : [best_of_family_1 (1g, 5w), best_of_family_2 (2g, 5w), best_of_family_3 (3g, 5w), best_of_family_4 (4g, 5w), best_of_family_5 (5g, 5w), all_2 (2g, 10w), all_3 (3g, 10w), all_4 (4g, 10w), all_5 (5g, 10w), monotonic (6g, 10w), best_of_family_gbm (6g, 10w), all_gbm (7g, 10w), best_of_family_xglm (8g, 10w), all_xglm (8g, 10w), best_of_family (10g, 10w), best_N (10g, 10w)]}]
00:52:58.708: Disabling Algo: XGBoost as requested by the user.
00:52:58.708: AutoML job created: 2025.03.25 00:52:58.706
00:52:58.709: AutoML build started: 2025.03.25 00:52:58.709
00:52:58.710: AutoML: starting GLM_1_AutoML_2_20250325_05258 model training
00:52:59.230: New leader: GLM_1_AutoML_2_20250325_05258, auc: 0.9848418510928573
00:52:59.233: AutoML: starting GBM_1_AutoML_2_20250325_05258 model training
00:53:00.955: New leader: GBM_1_AutoML_2_20250325_05258, auc: 0.9964776833368534
00:53:00.956: AutoML: starting StackedEnsemble_BestOfFamily_1_AutoML_2_20250325_05258 model training
00:53:01.126: AutoML: starting DRF_1_AutoML_2_20250325_05258 model training
00:53:01.885: AutoML: starting GBM_2_AutoML_2_20250325_05258 model training
00:53:02.768: New leader: GBM_2_AutoML_2_20250325_05258, auc: 0.9970530816346334
00:53:02.768: AutoML: starting GBM_3_AutoML_2_20250325_05258 model training
██
00:53:03.735: AutoML: starting GBM_4_AutoML_2_20250325_05258 model training
00:53:04.748: AutoML: starting StackedEnsemble_BestOfFamily_2_AutoML_2_20250325_05258 model training
00:53:05.11: AutoML: starting StackedEnsemble_AllModels_1_AutoML_2_20250325_05258 model training
00:53:05.166: AutoML: starting XRT_1_AutoML_2_20250325_05258 model training
00:53:05.679: AutoML: starting GBM_5_AutoML_2_20250325_05258 model training
00:53:06.581: New leader: GBM_5_AutoML_2_20250325_05258, auc: 0.9973356432987219
00:53:06.581: AutoML: starting DeepLearning_1_AutoML_2_20250325_05258 model training
00:53:07.3: AutoML: starting StackedEnsemble_BestOfFamily_3_AutoML_2_20250325_05258 model training
00:53:07.279: AutoML: starting StackedEnsemble_AllModels_2_AutoML_2_20250325_05258 model training
00:53:07.558: AutoML: starting GBM_grid_1_AutoML_2_20250325_05258 hyperparameter search
███████████████████████████
00:53:27.858: New leader: GBM_grid_1_AutoML_2_20250325_05258_model_10, auc: 0.9973992910270975
████████
00:53:36.928: AutoML: starting DeepLearning_grid_1_AutoML_2_20250325_05258 hyperparameter search
██████████████
00:53:51.460: AutoML: starting StackedEnsemble_BestOfFamily_4_AutoML_2_20250325_05258 model training
00:53:51.743: AutoML: starting StackedEnsemble_AllModels_3_AutoML_2_20250325_05258 model training
█████
00:53:52.72: AutoML: starting DeepLearning_grid_2_AutoML_2_20250325_05258 hyperparameter search
00:53:54.829: AutoML: starting DeepLearning_grid_3_AutoML_2_20250325_05258 hyperparameter search
█████
00:53:57.690: AutoML: starting StackedEnsemble_AllModels_4_AutoML_2_20250325_05258 model training
00:53:58.74: Retraining best GBM with learning rate annealing: GBM_grid_1_AutoML_2_20250325_05258_model_10
00:53:58.74: AutoML: starting GBM_lr_annealing_selection_AutoML_2_20250325_05258_select_model model training
00:53:59.983: Actual modeling steps: [{GLM : [def_1 (1g, 10w)]}, {GBM : [def_5 (1g, 10w)]}, {StackedEnsemble : [best_of_family_1 (1g, 5w)]}, {DRF : [def_1 (2g, 10w)]}, {GBM : [def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w)]}, {StackedEnsemble : [best_of_family_2 (2g, 5w), all_2 (2g, 10w)]}, {DRF : [XRT (3g, 10w)]}, {GBM : [def_1 (3g, 10w)]}, {DeepLearning : [def_1 (3g, 10w)]}, {StackedEnsemble : [best_of_family_3 (3g, 5w), all_3 (3g, 10w)]}, {GBM : [grid_1 (4g, 60w)]}, {DeepLearning : [grid_1 (4g, 30w)]}, {StackedEnsemble : [best_of_family_4 (4g, 5w), all_4 (4g, 10w)]}, {DeepLearning : [grid_2 (5g, 30w), grid_3 (5g, 30w)]}, {StackedEnsemble : [all_5 (5g, 10w)]}, {GBM : [lr_annealing (6g, 10w)]}]
00:53:59.983: AutoML build stopped: 2025.03.25 00:53:59.983
00:53:59.983: AutoML build done: built 35 models
00:53:59.983: AutoML duration: 1 min 1.274 sec
█| (done) 100%
ModelMetricsBinomial: gbm
** Reported on test data. **
MSE: 0.0056893233587090985
RMSE: 0.07542760342678997
LogLoss: 0.01841294346956109
Mean Per-Class Error: 0.0038461538461538464
AUC: 0.9998153846153847
AUCPR: 0.9994308033611844
Gini: 0.9996307692307693
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.1346544244507371
0 1 Error Rate
----- --- --- ------- -----------
0 387 3 0.0077 (3.0/390.0)
1 0 125 0 (0.0/125.0)
Total 387 128 0.0058 (3.0/515.0)
Maximum Metrics: Maximum metrics at their respective thresholds
metric threshold value idx
--------------------------- ----------- -------- -----
max f1 0.134654 0.988142 115
max f2 0.134654 0.995223 115
max f0point5 0.666699 0.995106 109
max accuracy 0.666699 0.994175 109
max precision 0.999298 1 0
max recall 0.134654 1 115
max specificity 0.999298 1 0
max absolute_mcc 0.134654 0.984404 115
max min_per_class_accuracy 0.134654 0.992308 115
max mean_per_class_accuracy 0.134654 0.996154 115
max tns 0.999298 390 0
max fns 0.999298 123 0
max fps 0.000157684 390 399
max tps 0.134654 125 115
max tnr 0.999298 1 0
max fnr 0.999298 0.984 0
max fpr 0.000157684 1 399
max tpr 0.134654 1 115
Gains/Lift Table: Avg response rate: 24.27 %, avg score: 23.96 %
group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate score cumulative_response_rate cumulative_score capture_rate cumulative_capture_rate gain cumulative_gain kolmogorov_smirnov
------- -------------------------- ----------------- ------- ----------------- --------------- ----------- -------------------------- ------------------ -------------- ------------------------- ------- ----------------- --------------------
1 0.0116505 0.999053 4.12 4.12 1 0.999143 1 0.999143 0.048 0.048 312 312 0.048
2 0.0213592 0.999 4.12 4.12 1 0.999011 1 0.999083 0.04 0.088 312 312 0.088
3 0.031068 0.998884 4.12 4.12 1 0.99894 1 0.999038 0.04 0.128 312 312 0.128
4 0.0407767 0.998811 4.12 4.12 1 0.998846 1 0.998993 0.04 0.168 312 312 0.168
5 0.0504854 0.998726 4.12 4.12 1 0.998765 1 0.998949 0.04 0.208 312 312 0.208
6 0.100971 0.998496 4.12 4.12 1 0.998611 1 0.99878 0.208 0.416 312 312 0.416
7 0.151456 0.998089 4.12 4.12 1 0.998334 1 0.998631 0.208 0.624 312 312 0.624
8 0.2 0.995943 4.12 4.12 1 0.997439 1 0.998342 0.2 0.824 312 312 0.824
9 0.300971 0.00554024 1.74308 3.32258 0.423077 0.386794 0.806452 0.793177 0.176 1 74.3077 232.258 0.923077
10 0.4 0.00237039 0 2.5 0 0.0038733 0.606796 0.597767 0 1 -100 150 0.792308
11 0.500971 0.00132393 0 1.99612 0 0.00177338 0.484496 0.477644 0 1 -100 99.6124 0.658974
12 0.6 0.000788305 0 1.66667 0 0.00102156 0.404531 0.398979 0 1 -100 66.6667 0.528205
13 0.699029 0.000568038 0 1.43056 0 0.000656848 0.347222 0.34255 0 1 -100 43.0556 0.397436
14 0.8 0.000400525 0 1.25 0 0.000481929 0.303398 0.299376 0 1 -100 25 0.264103
15 0.899029 0.000267807 0 1.11231 0 0.000330957 0.269978 0.266436 0 1 -100 11.2311 0.133333
16 1 0.000157363 0 1 0 0.000217108 0.242718 0.239556 0 1 -100 0 0
In [60]:
# 用来查看 H2O AutoML 所训练的所有模型的排行榜(leaderboard)
lb = automl.leaderboard
lb.head() # 查看排行榜中最靠前的几条记录(默认前 10 个模型)
Out[60]:
| model_id | auc | logloss | aucpr | mean_per_class_error | rmse | mse |
|---|---|---|---|---|---|---|
| GBM_grid_1_AutoML_2_20250325_05258_model_10 | 0.997399 | 0.0456575 | 0.995637 | 0.0221437 | 0.109083 | 0.011899 |
| GBM_5_AutoML_2_20250325_05258 | 0.997336 | 0.0438969 | 0.995712 | 0.0225102 | 0.107085 | 0.0114673 |
| GBM_grid_1_AutoML_2_20250325_05258_model_5 | 0.997303 | 0.0421013 | 0.995893 | 0.0228707 | 0.104793 | 0.0109815 |
| StackedEnsemble_AllModels_3_AutoML_2_20250325_05258 | 0.997289 | 0.0395168 | 0.996057 | 0.0181904 | 0.100036 | 0.0100072 |
| StackedEnsemble_AllModels_4_AutoML_2_20250325_05258 | 0.997216 | 0.0394978 | 0.995998 | 0.0177078 | 0.0999224 | 0.00998449 |
| GBM_grid_1_AutoML_2_20250325_05258_model_4 | 0.997174 | 0.0419 | 0.995587 | 0.0205916 | 0.10349 | 0.0107102 |
| GBM_grid_1_AutoML_2_20250325_05258_model_14 | 0.997093 | 0.0450541 | 0.995276 | 0.020109 | 0.107151 | 0.0114814 |
| GBM_grid_1_AutoML_2_20250325_05258_model_6 | 0.997071 | 0.0475895 | 0.995481 | 0.0237078 | 0.10739 | 0.0115326 |
| GBM_2_AutoML_2_20250325_05258 | 0.997053 | 0.0466026 | 0.995157 | 0.023109 | 0.109112 | 0.0119054 |
| GBM_4_AutoML_2_20250325_05258 | 0.996991 | 0.0465359 | 0.995036 | 0.0213066 | 0.10774 | 0.0116078 |
[10 rows x 7 columns]
In [61]:
# AutoML 训练完成后生成的模型排行榜
model_ids = list(automl.leaderboard['model_id'].as_data_frame().iloc[:,0])
model_ids
D:\anaconda\Lib\site-packages\h2o\frame.py:1983: H2ODependencyWarning: Converting H2O frame to pandas dataframe using single-thread. For faster conversion using multi-thread, install polars and pyarrow and use it as pandas_df = h2o_df.as_data_frame(use_multi_thread=True)
warnings.warn("Converting H2O frame to pandas dataframe using single-thread. For faster conversion using"
Out[61]:
['GBM_grid_1_AutoML_2_20250325_05258_model_10', 'GBM_5_AutoML_2_20250325_05258', 'GBM_grid_1_AutoML_2_20250325_05258_model_5', 'StackedEnsemble_AllModels_3_AutoML_2_20250325_05258', 'StackedEnsemble_AllModels_4_AutoML_2_20250325_05258', 'GBM_grid_1_AutoML_2_20250325_05258_model_4', 'GBM_grid_1_AutoML_2_20250325_05258_model_14', 'GBM_grid_1_AutoML_2_20250325_05258_model_6', 'GBM_2_AutoML_2_20250325_05258', 'GBM_4_AutoML_2_20250325_05258', 'StackedEnsemble_BestOfFamily_4_AutoML_2_20250325_05258', 'StackedEnsemble_BestOfFamily_3_AutoML_2_20250325_05258', 'GBM_grid_1_AutoML_2_20250325_05258_model_9', 'GBM_3_AutoML_2_20250325_05258', 'GBM_grid_1_AutoML_2_20250325_05258_model_3', 'StackedEnsemble_AllModels_2_AutoML_2_20250325_05258', 'StackedEnsemble_AllModels_1_AutoML_2_20250325_05258', 'GBM_grid_1_AutoML_2_20250325_05258_model_15', 'GBM_grid_1_AutoML_2_20250325_05258_model_2', 'GBM_1_AutoML_2_20250325_05258', 'StackedEnsemble_BestOfFamily_2_AutoML_2_20250325_05258', 'StackedEnsemble_BestOfFamily_1_AutoML_2_20250325_05258', 'GBM_grid_1_AutoML_2_20250325_05258_model_13', 'GBM_grid_1_AutoML_2_20250325_05258_model_8', 'GBM_grid_1_AutoML_2_20250325_05258_model_11', 'GBM_grid_1_AutoML_2_20250325_05258_model_12', 'GBM_grid_1_AutoML_2_20250325_05258_model_1', 'XRT_1_AutoML_2_20250325_05258', 'DRF_1_AutoML_2_20250325_05258', 'GBM_grid_1_AutoML_2_20250325_05258_model_16', 'GBM_grid_1_AutoML_2_20250325_05258_model_7', 'DeepLearning_grid_1_AutoML_2_20250325_05258_model_1', 'DeepLearning_grid_2_AutoML_2_20250325_05258_model_1', 'DeepLearning_1_AutoML_2_20250325_05258', 'DeepLearning_grid_1_AutoML_2_20250325_05258_model_2', 'DeepLearning_grid_1_AutoML_2_20250325_05258_model_3', 'DeepLearning_grid_3_AutoML_2_20250325_05258_model_1', 'DeepLearning_grid_2_AutoML_2_20250325_05258_model_2', 'DeepLearning_grid_3_AutoML_2_20250325_05258_model_2', 'DeepLearning_grid_1_AutoML_2_20250325_05258_model_4', 'DeepLearning_grid_3_AutoML_2_20250325_05258_model_3', 'GLM_1_AutoML_2_20250325_05258', 'DeepLearning_grid_2_AutoML_2_20250325_05258_model_3']
In [66]:
# 从 AutoML 模型排行榜中筛选出一个 XGBoost 模型,并将其 H2O 格式的参数转换为原生 XGBoost 可识别的格式
out = h2o.get_model([mid for mid in model_ids if "XGBoost" in mid][0])
out.convert_H2OXGBoostParams_2_XGBoostParams()
# H2O 现在的后端确实支持 XGBoost(所以你能创建 H2OXGBoostEstimator()),
# 但在这一次 AutoML 过程中,并没有实际训练出 XGBoost 模型,从而排行榜里根本没有以 “XGBoost_...” 命名的模型。
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) Cell In[66], line 2 1 # 从 AutoML 模型排行榜中筛选出一个 XGBoost 模型,并将其 H2O 格式的参数转换为原生 XGBoost 可识别的格式 ----> 2 out = h2o.get_model([mid for mid in model_ids if "XGBoost" in mid][0]) 3 out.convert_H2OXGBoostParams_2_XGBoostParams() IndexError: list index out of range
In [63]:
from h2o.estimators import H2OXGBoostEstimator
try:
m = H2OXGBoostEstimator()
print("✅ XGBoost 可用")
except Exception as e:
print("❌ XGBoost 不可用:", e)
✅ XGBoost 可用
In [65]:
# 直接训练 XGBoost 模型
import h2o
from h2o.estimators import H2OXGBoostEstimator
# 如果还没连接到手动启动的 H2O
h2o.init(ip="localhost", port=54321) # start_h2o=False 如果已经有集群在运行
# 假设你已经有训练集 train_h2o (类型是 H2OFrame),x为特征列表,y为目标列
# 例:
# x = ["col1", "col2", ...]
# y = "target"
xgb = H2OXGBoostEstimator(
ntrees=100,
max_depth=5,
learn_rate=0.1,
seed=42
)
xgb.train(x=x, y=y, training_frame=train_h2o)
# 评估模型
perf = xgb.model_performance(valid=True) # 如果有验证集的话
print(perf)
Checking whether there is an H2O instance running at http://localhost:54321. connected. Warning: Your H2O cluster version is (4 months and 22 days) old. There may be a newer version available. Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
| H2O_cluster_uptime: | 11 mins 42 secs |
| H2O_cluster_timezone: | Europe/Moscow |
| H2O_data_parsing_timezone: | UTC |
| H2O_cluster_version: | 3.46.0.6 |
| H2O_cluster_version_age: | 4 months and 22 days |
| H2O_cluster_name: | H2O_from_python_25150_atuyo6 |
| H2O_cluster_total_nodes: | 1 |
| H2O_cluster_free_memory: | 7.745 Gb |
| H2O_cluster_total_cores: | 32 |
| H2O_cluster_allowed_cores: | 32 |
| H2O_cluster_status: | locked, healthy |
| H2O_connection_url: | http://localhost:54321 |
| H2O_connection_proxy: | {"http": null, "https": null} |
| H2O_internal_security: | False |
| Python_version: | 3.11.5 final |
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[65], line 19 8 # 假设你已经有训练集 train_h2o (类型是 H2OFrame),x为特征列表,y为目标列 9 # 例: 10 # x = ["col1", "col2", ...] 11 # y = "target" 13 xgb = H2OXGBoostEstimator( 14 ntrees=100, 15 max_depth=5, 16 learn_rate=0.1, 17 seed=42 18 ) ---> 19 xgb.train(x=x, y=y, training_frame=train_h2o) 21 # 评估模型 22 perf = xgb.model_performance(valid=True) # 如果有验证集的话 NameError: name 'train_h2o' is not defined
In [64]:
# 在 AutoML 中启用 XGBoost
from h2o.automl import H2OAutoML
automl = H2OAutoML(
max_runtime_secs=60,
seed=42,
include_algos=["XGBoost", "GBM", "GLM", "DRF", "DeepLearning", "StackedEnsemble"],
verbosity="info"
)
automl.train(x=x, y=y, training_frame=train_h2o)
lb = automl.leaderboard
lb.head()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[64], line 10 2 from h2o.automl import H2OAutoML 4 automl = H2OAutoML( 5 max_runtime_secs=60, 6 seed=42, 7 include_algos=["XGBoost", "GBM", "GLM", "DRF", "DeepLearning", "StackedEnsemble"], 8 verbosity="info" 9 ) ---> 10 automl.train(x=x, y=y, training_frame=train_h2o) 12 lb = automl.leaderboard 13 lb.head() NameError: name 'train_h2o' is not defined
In [60]:
xgb_ids = [mid for mid in model_ids if "XGBoost" in mid]
print(xgb_ids) # 如果是空的,说明没有 XGBoost 模型
[]
In [61]:
if xgb_ids:
out = h2o.get_model(xgb_ids[0])
params = out.convert_H2OXGBoostParams_2_XGBoostParams()
print(params)
else:
print("⚠️ 当前 AutoML 没有训练出 XGBoost 模型。")
⚠️ 当前 AutoML 没有训练出 XGBoost 模型。
In [62]:
import h2o
from h2o.estimators import H2OXGBoostEstimator
h2o.init()
# 检查是否可以创建 H2O 的 XGBoost 模型
try:
model = H2OXGBoostEstimator()
print("✅ 当前环境支持 H2O 的 XGBoost 模型。")
except Exception as e:
print("❌ 当前环境不支持 H2O 的 XGBoost,原因:", e)
Checking whether there is an H2O instance running at http://localhost:54321. connected. Warning: Your H2O cluster version is (4 months and 21 days) old. There may be a newer version available. Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
| H2O_cluster_uptime: | 11 hours 21 mins |
| H2O_cluster_timezone: | Europe/Moscow |
| H2O_data_parsing_timezone: | UTC |
| H2O_cluster_version: | 3.46.0.6 |
| H2O_cluster_version_age: | 4 months and 21 days |
| H2O_cluster_name: | H2O_from_python_25150_lkcgsm |
| H2O_cluster_total_nodes: | 1 |
| H2O_cluster_free_memory: | 7.331 Gb |
| H2O_cluster_total_cores: | 32 |
| H2O_cluster_allowed_cores: | 32 |
| H2O_cluster_status: | locked, healthy |
| H2O_connection_url: | http://localhost:54321 |
| H2O_connection_proxy: | {"http": null, "https": null} |
| H2O_internal_security: | False |
| Python_version: | 3.11.5 final |
✅ 当前环境支持 H2O 的 XGBoost 模型。
In [81]:
automl = H2OAutoML(
max_runtime_secs=120,
seed=42,
include_algos=["XGBoost", "GBM", "GLM", "DRF", "DeepLearning", "StackedEnsemble"]
)
from h2o.estimators import H2OXGBoostEstimator
xgb = H2OXGBoostEstimator(ntrees=100, max_depth=5, learn_rate=0.1, seed=42)
xgb.train(x=x, y=y, training_frame=train)
xgb.varimp_plot()
xgb.convert_H2OXGBoostParams_2_XGBoostParams()
--------------------------------------------------------------------------- H2OResponseError Traceback (most recent call last) Cell In[81], line 9 6 from h2o.estimators import H2OXGBoostEstimator 8 xgb = H2OXGBoostEstimator(ntrees=100, max_depth=5, learn_rate=0.1, seed=42) ----> 9 xgb.train(x=x, y=y, training_frame=train) 10 xgb.varimp_plot() 11 xgb.convert_H2OXGBoostParams_2_XGBoostParams() File D:\anaconda\Lib\site-packages\h2o\estimators\estimator_base.py:107, in H2OEstimator.train(self, x, y, training_frame, offset_column, fold_column, weights_column, validation_frame, max_runtime_secs, ignored_columns, model_id, verbose) 88 """ 89 Train the H2O model. 90 (...) 101 :param bool verbose: Print scoring history to stdout. Defaults to False. 102 """ 103 parms = self._make_parms(x=x, y=y, training_frame=training_frame, offset_column=offset_column, 104 fold_column=fold_column, weights_column=weights_column, 105 validation_frame=validation_frame, max_runtime_secs=max_runtime_secs, 106 ignored_columns=ignored_columns, model_id=model_id, verbose=verbose) --> 107 self._train(parms, verbose=verbose) 108 return self File D:\anaconda\Lib\site-packages\h2o\estimators\estimator_base.py:186, in H2OEstimator._train(self, parms, verbose) 183 assert_is_type(verbose, bool) 185 rest_ver = self._get_rest_version(parms) --> 186 model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms) 187 job = H2OJob(model_builder_json, job_type=(self.algo + " Model Build")) 189 if model_builder_json["messages"] is not None: File D:\anaconda\Lib\site-packages\h2o\h2o.py:123, in api(endpoint, data, json, filename, save_to) 121 # type checks are performed in H2OConnection class 122 _check_connection() --> 123 return h2oconn.request(endpoint, data=data, json=json, filename=filename, save_to=save_to) File D:\anaconda\Lib\site-packages\h2o\backend\connection.py:499, in H2OConnection.request(self, endpoint, data, json, filename, save_to) 497 save_to = save_to(resp) 498 self._log_end_transaction(start_time, resp) --> 499 return self._process_response(resp, save_to) 501 except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as e: 502 if self._local_server and not self._local_server.is_running(): File D:\anaconda\Lib\site-packages\h2o\backend\connection.py:853, in H2OConnection._process_response(response, save_to) 851 if status_code in {400, 404, 412} and isinstance(data, H2OErrorV3): 852 data.show_stacktrace = False --> 853 raise H2OResponseError(data) 855 # Server errors (notably 500 = "Server Error") 856 # Note that it is possible to receive valid H2OErrorV3 object in this case, however it merely means the server 857 # did not provide the correct status code. 858 raise H2OServerError("HTTP %d %s:\n%s" % (status_code, response.reason, data)) H2OResponseError: Server error water.exceptions.H2ONotFoundArgumentException: Error: POST /3/ModelBuilders/xgboost not found Request: POST /3/ModelBuilders/xgboost data: {'training_frame': 'py_37_sid_add6', 'nfolds': '0', 'keep_cross_validation_models': 'True', 'keep_cross_validation_predictions': 'False', 'keep_cross_validation_fold_assignment': 'False', 'score_each_iteration': 'False', 'fold_assignment': 'auto', 'response_column': 'Healthy_Status', 'ignore_const_cols': 'True', 'stopping_rounds': '0', 'stopping_metric': 'auto', 'stopping_tolerance': '0.001', 'max_runtime_secs': '0.0', 'seed': '42', 'distribution': 'auto', 'tweedie_power': '1.5', 'categorical_encoding': 'auto', 'quiet_mode': 'True', 'ntrees': '100', 'max_depth': '5', 'min_rows': '1.0', 'min_child_weight': '1.0', 'learn_rate': '0.1', 'eta': '0.3', 'sample_rate': '1.0', 'subsample': '1.0', 'col_sample_rate': '1.0', 'colsample_bylevel': '1.0', 'col_sample_rate_per_tree': '1.0', 'colsample_bytree': '1.0', 'colsample_bynode': '1.0', 'max_abs_leafnode_pred': '0.0', 'max_delta_step': '0.0', 'score_tree_interval': '0', 'min_split_improvement': '0.0', 'gamma': '0.0', 'nthread': '-1', 'build_tree_one_node': 'False', 'parallelize_cross_validation': 'True', 'calibrate_model': 'False', 'calibration_method': 'auto', 'max_bins': '256', 'max_leaves': '0', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': '0.0', 'one_drop': 'False', 'skip_drop': '0.0', 'tree_method': 'auto', 'grow_policy': 'depthwise', 'booster': 'gbtree', 'reg_lambda': '1.0', 'reg_alpha': '0.0', 'dmatrix_type': 'auto', 'backend': 'auto', 'gainslift_bins': '-1', 'auc_type': 'auto', 'scale_pos_weight': '1.0', 'score_eval_metric_only': 'False'}
In [63]:
# 自动检测并启用 XGBoost(如果可用),否则跳过。
from h2o.automl import H2OAutoML
automl = H2OAutoML(
max_runtime_secs=60,
seed=42,
verbosity="info"
)
automl.train(x=x, y=y, training_frame=train)
AutoML progress: |█
01:27:50.94: Project: AutoML_10_20250324_12750
01:27:50.94: 5-fold cross-validation will be used.
01:27:50.94: Setting stopping tolerance adaptively based on the training frame: 0.018464772811525407
01:27:50.94: Build control seed: 42
01:27:50.94: training frame: Frame key: AutoML_10_20250324_12750_training_py_10_sid_baac cols: 10 rows: 2933 chunks: 1 size: 69060 checksum: -6527674322480268147
01:27:50.94: validation frame: NULL
01:27:50.94: leaderboard frame: NULL
01:27:50.94: blending frame: NULL
01:27:50.94: response column: Healthy_Status
01:27:50.94: fold column: null
01:27:50.94: weights column: null
01:27:50.94: AutoML: XGBoost is not available; skipping it.
01:27:50.94: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (6g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid_1 (4g, 60w), lr_annealing (6g, 10w)]}, {DeepLearning : [def_1 (3g, 10w), grid_1 (4g, 30w), grid_2 (5g, 30w), grid_3 (5g, 30w)]}, {completion : [resume_best_grids (10g, 60w)]}, {StackedEnsemble : [best_of_family_1 (1g, 5w), best_of_family_2 (2g, 5w), best_of_family_3 (3g, 5w), best_of_family_4 (4g, 5w), best_of_family_5 (5g, 5w), all_2 (2g, 10w), all_3 (3g, 10w), all_4 (4g, 10w), all_5 (5g, 10w), monotonic (6g, 10w), best_of_family_gbm (6g, 10w), all_gbm (7g, 10w), best_of_family_xglm (8g, 10w), all_xglm (8g, 10w), best_of_family (10g, 10w), best_N (10g, 10w)]}]
01:27:50.95: Disabling Algo: XGBoost as requested by the user.
01:27:50.95: AutoML job created: 2025.03.24 01:27:50.94
01:27:50.102: AutoML build started: 2025.03.24 01:27:50.98
01:27:50.104: AutoML: starting GLM_1_AutoML_10_20250324_12750 model training
01:27:50.238: New leader: GLM_1_AutoML_10_20250324_12750, auc: 0.9868937054394548
01:27:50.238: AutoML: starting GBM_1_AutoML_10_20250324_12750 model training
01:27:51.228: New leader: GBM_1_AutoML_10_20250324_12750, auc: 0.996292163052351
01:27:51.229: AutoML: starting StackedEnsemble_BestOfFamily_1_AutoML_10_20250324_12750 model training
01:27:51.430: AutoML: starting DRF_1_AutoML_10_20250324_12750 model training
01:27:51.845: AutoML: starting GBM_2_AutoML_10_20250324_12750 model training
01:27:52.701: New leader: GBM_2_AutoML_10_20250324_12750, auc: 0.9971375647180377
01:27:52.701: AutoML: starting GBM_3_AutoML_10_20250324_12750 model training
01:27:53.713: New leader: GBM_3_AutoML_10_20250324_12750, auc: 0.9974458138061342
01:27:53.714: AutoML: starting GBM_4_AutoML_10_20250324_12750 model training
██
01:27:54.775: AutoML: starting StackedEnsemble_BestOfFamily_2_AutoML_10_20250324_12750 model training
01:27:54.936: New leader: StackedEnsemble_BestOfFamily_2_AutoML_10_20250324_12750, auc: 0.9975342927110507
01:27:54.937: AutoML: starting StackedEnsemble_AllModels_1_AutoML_10_20250324_12750 model training
01:27:55.96: AutoML: starting XRT_1_AutoML_10_20250324_12750 model training
01:27:55.562: AutoML: starting GBM_5_AutoML_10_20250324_12750 model training
01:27:56.274: AutoML: starting DeepLearning_1_AutoML_10_20250324_12750 model training
01:27:56.546: AutoML: starting StackedEnsemble_BestOfFamily_3_AutoML_10_20250324_12750 model training
01:27:56.704: AutoML: starting StackedEnsemble_AllModels_2_AutoML_10_20250324_12750 model training
01:27:56.865: AutoML: starting GBM_grid_1_AutoML_10_20250324_12750 hyperparameter search
████████████████████████
01:28:12.91: New leader: GBM_grid_1_AutoML_10_20250324_12750_model_9, auc: 0.9976005091818271
███████████
01:28:27.329: AutoML: starting DeepLearning_grid_1_AutoML_10_20250324_12750 hyperparameter search
████████████████
01:28:42.748: AutoML: starting StackedEnsemble_BestOfFamily_4_AutoML_10_20250324_12750 model training
01:28:43.61: AutoML: starting StackedEnsemble_AllModels_3_AutoML_10_20250324_12750 model training
01:28:43.408: New leader: StackedEnsemble_AllModels_3_AutoML_10_20250324_12750, auc: 0.9979652706027412
01:28:43.410: AutoML: starting DeepLearning_grid_2_AutoML_10_20250324_12750 hyperparameter search
█████
01:28:47.390: AutoML: starting DeepLearning_grid_3_AutoML_10_20250324_12750 hyperparameter search
████| (done) 100%
01:28:51.15: Actual modeling steps: [{GLM : [def_1 (1g, 10w)]}, {GBM : [def_5 (1g, 10w)]}, {StackedEnsemble : [best_of_family_1 (1g, 5w)]}, {DRF : [def_1 (2g, 10w)]}, {GBM : [def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w)]}, {StackedEnsemble : [best_of_family_2 (2g, 5w), all_2 (2g, 10w)]}, {DRF : [XRT (3g, 10w)]}, {GBM : [def_1 (3g, 10w)]}, {DeepLearning : [def_1 (3g, 10w)]}, {StackedEnsemble : [best_of_family_3 (3g, 5w), all_3 (3g, 10w)]}, {GBM : [grid_1 (4g, 60w)]}, {DeepLearning : [grid_1 (4g, 30w)]}, {StackedEnsemble : [best_of_family_4 (4g, 5w), all_4 (4g, 10w)]}, {DeepLearning : [grid_2 (5g, 30w), grid_3 (5g, 30w)]}]
01:28:51.15: AutoML build stopped: 2025.03.24 01:28:51.15
01:28:51.15: AutoML build done: built 34 models
01:28:51.15: AutoML duration: 1 min 0.917 sec
Out[63]:
Model Details ============= H2OStackedEnsembleEstimator : Stacked Ensemble Model Key: StackedEnsemble_AllModels_3_AutoML_10_20250324_12750
| key | value |
|---|---|
| Stacking strategy | cross_validation |
| Number of base models (used / total) | 6/31 |
| # GBM base models (used / total) | 4/23 |
| # DRF base models (used / total) | 2/2 |
| # DeepLearning base models (used / total) | 0/5 |
| # GLM base models (used / total) | 0/1 |
| Metalearner algorithm | GLM |
| Metalearner fold assignment scheme | Random |
| Metalearner nfolds | 5 |
| Metalearner fold_column | None |
| Custom metalearner hyperparameters | None |
ModelMetricsBinomialGLM: stackedensemble ** Reported on train data. ** MSE: 9.240592103219748e-05 RMSE: 0.009612799853955011 LogLoss: 0.0029767531608037834 AUC: 1.0 AUCPR: 1.0 Gini: 1.0 Null degrees of freedom: 2932 Residual degrees of freedom: 2926 Null deviance: 3503.936615660394 Residual deviance: 17.46163404127499 AIC: 31.46163404127499
| 0 | 1 | Error | Rate | |
|---|---|---|---|---|
| 0 | 2098.0 | 0.0 | 0.0 | (0.0/2098.0) |
| 1 | 0.0 | 835.0 | 0.0 | (0.0/835.0) |
| Total | 2098.0 | 835.0 | 0.0 | (0.0/2933.0) |
| metric | threshold | value | idx |
|---|---|---|---|
| max f1 | 0.9322314 | 1.0 | 122.0 |
| max f2 | 0.9322314 | 1.0 | 122.0 |
| max f0point5 | 0.9322314 | 1.0 | 122.0 |
| max accuracy | 0.9322314 | 1.0 | 122.0 |
| max precision | 0.9998620 | 1.0 | 0.0 |
| max recall | 0.9322314 | 1.0 | 122.0 |
| max specificity | 0.9998620 | 1.0 | 0.0 |
| max absolute_mcc | 0.9322314 | 1.0 | 122.0 |
| max min_per_class_accuracy | 0.9322314 | 1.0 | 122.0 |
| max mean_per_class_accuracy | 0.9322314 | 1.0 | 122.0 |
| max tns | 0.9998620 | 2098.0 | 0.0 |
| max fns | 0.9998620 | 822.0 | 0.0 |
| max fps | 0.0000647 | 2098.0 | 399.0 |
| max tps | 0.9322314 | 835.0 | 122.0 |
| max tnr | 0.9998620 | 1.0 | 0.0 |
| max fnr | 0.9998620 | 0.9844311 | 0.0 |
| max fpr | 0.0000647 | 1.0 | 399.0 |
| max tpr | 0.9322314 | 1.0 | 122.0 |
| group | cumulative_data_fraction | lower_threshold | lift | cumulative_lift | response_rate | score | cumulative_response_rate | cumulative_score | capture_rate | cumulative_capture_rate | gain | cumulative_gain | kolmogorov_smirnov |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.0102284 | 0.9998396 | 3.5125749 | 3.5125749 | 1.0 | 0.9998528 | 1.0 | 0.9998528 | 0.0359281 | 0.0359281 | 251.2574850 | 251.2574850 | 0.0359281 |
| 2 | 0.0201159 | 0.9998267 | 3.5125749 | 3.5125749 | 1.0 | 0.9998318 | 1.0 | 0.9998425 | 0.0347305 | 0.0706587 | 251.2574850 | 251.2574850 | 0.0706587 |
| 3 | 0.0300034 | 0.9998215 | 3.5125749 | 3.5125749 | 1.0 | 0.9998242 | 1.0 | 0.9998365 | 0.0347305 | 0.1053892 | 251.2574850 | 251.2574850 | 0.1053892 |
| 4 | 0.0402318 | 0.9998150 | 3.5125749 | 3.5125749 | 1.0 | 0.9998181 | 1.0 | 0.9998318 | 0.0359281 | 0.1413174 | 251.2574850 | 251.2574850 | 0.1413174 |
| 5 | 0.0501193 | 0.9998089 | 3.5125749 | 3.5125749 | 1.0 | 0.9998116 | 1.0 | 0.9998278 | 0.0347305 | 0.1760479 | 251.2574850 | 251.2574850 | 0.1760479 |
| 6 | 0.1002387 | 0.9997828 | 3.5125749 | 3.5125749 | 1.0 | 0.9997954 | 1.0 | 0.9998116 | 0.1760479 | 0.3520958 | 251.2574850 | 251.2574850 | 0.3520958 |
| 7 | 0.1500170 | 0.9997479 | 3.5125749 | 3.5125749 | 1.0 | 0.9997668 | 1.0 | 0.9997968 | 0.1748503 | 0.5269461 | 251.2574850 | 251.2574850 | 0.5269461 |
| 8 | 0.2001364 | 0.9994625 | 3.5125749 | 3.5125749 | 1.0 | 0.9996776 | 1.0 | 0.9997669 | 0.1760479 | 0.7029940 | 251.2574850 | 251.2574850 | 0.7029940 |
| 9 | 0.3000341 | 0.0338779 | 2.9731009 | 3.3329545 | 0.8464164 | 0.8489891 | 0.9488636 | 0.9495647 | 0.2970060 | 1.0 | 197.3100897 | 233.2954545 | 0.9785510 |
| 10 | 0.3999318 | 0.0028213 | 0.0 | 2.5004263 | 0.0 | 0.0102039 | 0.7118500 | 0.7149247 | 0.0 | 1.0 | -100.0 | 150.0426257 | 0.8388942 |
| 11 | 0.5001705 | 0.0011241 | 0.0 | 1.9993183 | 0.0 | 0.0016506 | 0.5691888 | 0.5719782 | 0.0 | 1.0 | -100.0 | 99.9318337 | 0.6987607 |
| 12 | 0.6000682 | 0.0006141 | 0.0 | 1.6664773 | 0.0 | 0.0008402 | 0.4744318 | 0.4768967 | 0.0 | 1.0 | -100.0 | 66.6477273 | 0.5591039 |
| 13 | 0.6999659 | 0.0003816 | 0.0 | 1.4286410 | 0.0 | 0.0004884 | 0.4067219 | 0.4089047 | 0.0 | 1.0 | -100.0 | 42.8641013 | 0.4194471 |
| 14 | 0.7998636 | 0.0002528 | 0.0 | 1.2502131 | 0.0 | 0.0003136 | 0.3559250 | 0.3578743 | 0.0 | 1.0 | -100.0 | 25.0213129 | 0.2797903 |
| 15 | 0.8997613 | 0.0001431 | 0.0 | 1.1114058 | 0.0 | 0.0001954 | 0.3164077 | 0.3181623 | 0.0 | 1.0 | -100.0 | 11.1405836 | 0.1401335 |
| 16 | 1.0 | 0.0000548 | 0.0 | 1.0 | 0.0 | 0.0001039 | 0.2846914 | 0.2862806 | 0.0 | 1.0 | -100.0 | 0.0 | 0.0 |
ModelMetricsBinomialGLM: stackedensemble ** Reported on cross-validation data. ** MSE: 0.011684883939393473 RMSE: 0.10809664166565709 LogLoss: 0.04218696916105611 AUC: 0.9979652706027412 AUCPR: 0.9961374399921781 Gini: 0.9959305412054824 Null degrees of freedom: 2932 Residual degrees of freedom: 2924 Null deviance: 3505.305034144737 Residual deviance: 247.46876109875512 AIC: 265.4687610987551
| 0 | 1 | Error | Rate | |
|---|---|---|---|---|
| 0 | 2095.0 | 3.0 | 0.0014 | (3.0/2098.0) |
| 1 | 42.0 | 793.0 | 0.0503 | (42.0/835.0) |
| Total | 2137.0 | 796.0 | 0.0153 | (45.0/2933.0) |
| metric | threshold | value | idx |
|---|---|---|---|
| max f1 | 0.7174627 | 0.9724096 | 112.0 |
| max f2 | 0.1468996 | 0.9784156 | 189.0 |
| max f0point5 | 0.7174627 | 0.9865638 | 112.0 |
| max accuracy | 0.7174627 | 0.9846573 | 112.0 |
| max precision | 0.9999199 | 1.0 | 0.0 |
| max recall | 0.0028657 | 1.0 | 365.0 |
| max specificity | 0.9999199 | 1.0 | 0.0 |
| max absolute_mcc | 0.7174627 | 0.9623200 | 112.0 |
| max min_per_class_accuracy | 0.2379595 | 0.9808383 | 169.0 |
| max mean_per_class_accuracy | 0.1642521 | 0.9819737 | 185.0 |
| max tns | 0.9999199 | 2098.0 | 0.0 |
| max fns | 0.9999199 | 780.0 | 0.0 |
| max fps | 0.0000287 | 2098.0 | 399.0 |
| max tps | 0.0028657 | 835.0 | 365.0 |
| max tnr | 0.9999199 | 1.0 | 0.0 |
| max fnr | 0.9999199 | 0.9341317 | 0.0 |
| max fpr | 0.0000287 | 1.0 | 399.0 |
| max tpr | 0.0028657 | 1.0 | 365.0 |
| group | cumulative_data_fraction | lower_threshold | lift | cumulative_lift | response_rate | score | cumulative_response_rate | cumulative_score | capture_rate | cumulative_capture_rate | gain | cumulative_gain | kolmogorov_smirnov |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.0102284 | 0.9999180 | 3.5125749 | 3.5125749 | 1.0 | 0.9999352 | 1.0 | 0.9999352 | 0.0359281 | 0.0359281 | 251.2574850 | 251.2574850 | 0.0359281 |
| 2 | 0.0201159 | 0.9998868 | 3.5125749 | 3.5125749 | 1.0 | 0.9999013 | 1.0 | 0.9999185 | 0.0347305 | 0.0706587 | 251.2574850 | 251.2574850 | 0.0706587 |
| 3 | 0.0300034 | 0.9998575 | 3.5125749 | 3.5125749 | 1.0 | 0.9998702 | 1.0 | 0.9999026 | 0.0347305 | 0.1053892 | 251.2574850 | 251.2574850 | 0.1053892 |
| 4 | 0.0402318 | 0.9998225 | 3.5125749 | 3.5125749 | 1.0 | 0.9998415 | 1.0 | 0.9998871 | 0.0359281 | 0.1413174 | 251.2574850 | 251.2574850 | 0.1413174 |
| 5 | 0.0501193 | 0.9997939 | 3.5125749 | 3.5125749 | 1.0 | 0.9998088 | 1.0 | 0.9998716 | 0.0347305 | 0.1760479 | 251.2574850 | 251.2574850 | 0.1760479 |
| 6 | 0.1002387 | 0.9993996 | 3.5125749 | 3.5125749 | 1.0 | 0.9996169 | 1.0 | 0.9997443 | 0.1760479 | 0.3520958 | 251.2574850 | 251.2574850 | 0.3520958 |
| 7 | 0.1500170 | 0.9989825 | 3.5125749 | 3.5125749 | 1.0 | 0.9992138 | 1.0 | 0.9995683 | 0.1748503 | 0.5269461 | 251.2574850 | 251.2574850 | 0.5269461 |
| 8 | 0.2001364 | 0.9967890 | 3.5125749 | 3.5125749 | 1.0 | 0.9981442 | 1.0 | 0.9992116 | 0.1760479 | 0.7029940 | 251.2574850 | 251.2574850 | 0.7029940 |
| 9 | 0.3000341 | 0.1359474 | 2.8532178 | 3.2930389 | 0.8122867 | 0.7999760 | 0.9375 | 0.9328752 | 0.2850299 | 0.9880240 | 185.3217796 | 229.3038922 | 0.9618085 |
| 10 | 0.3999318 | 0.0113840 | 0.0719299 | 2.4884482 | 0.0204778 | 0.0379940 | 0.7084399 | 0.7093456 | 0.0071856 | 0.9952096 | -92.8070140 | 148.8448168 | 0.8321972 |
| 11 | 0.5001705 | 0.0037902 | 0.0358426 | 1.9969239 | 0.0102041 | 0.0065884 | 0.5685072 | 0.5685068 | 0.0035928 | 0.9988024 | -96.4157399 | 99.6923944 | 0.6970865 |
| 12 | 0.6000682 | 0.0016244 | 0.0119883 | 1.6664773 | 0.0034130 | 0.0025423 | 0.4744318 | 0.4742865 | 0.0011976 | 1.0 | -98.8011690 | 66.6477273 | 0.5591039 |
| 13 | 0.6999659 | 0.0008044 | 0.0 | 1.4286410 | 0.0 | 0.0011552 | 0.4067219 | 0.4067622 | 0.0 | 1.0 | -100.0 | 42.8641013 | 0.4194471 |
| 14 | 0.7998636 | 0.0003900 | 0.0 | 1.2502131 | 0.0 | 0.0005843 | 0.3559250 | 0.3560332 | 0.0 | 1.0 | -100.0 | 25.0213129 | 0.2797903 |
| 15 | 0.8997613 | 0.0001276 | 0.0 | 1.1114058 | 0.0 | 0.0002462 | 0.3164077 | 0.3165313 | 0.0 | 1.0 | -100.0 | 11.1405836 | 0.1401335 |
| 16 | 1.0 | 6.99e-06 | 0.0 | 1.0 | 0.0 | 0.0000593 | 0.2846914 | 0.2848086 | 0.0 | 1.0 | -100.0 | 0.0 | 0.0 |
| mean | sd | cv_1_valid | cv_2_valid | cv_3_valid | cv_4_valid | cv_5_valid | |
|---|---|---|---|---|---|---|---|
| accuracy | 0.9866936 | 0.0038640 | 0.9829351 | 0.9917898 | 0.9831366 | 0.9863014 | 0.9893048 |
| aic | 66.293755 | 12.12167 | 74.20859 | 56.05992 | 82.7434 | 64.225174 | 54.231678 |
| auc | 0.9980263 | 0.0006794 | 0.9978026 | 0.9976698 | 0.9974441 | 0.9980351 | 0.9991797 |
| err | 0.0133064 | 0.0038640 | 0.0170648 | 0.0082102 | 0.0168634 | 0.0136986 | 0.0106952 |
| err_count | 7.8 | 2.280351 | 10.0 | 5.0 | 10.0 | 8.0 | 6.0 |
| f0point5 | 0.9826331 | 0.0084325 | 0.9843938 | 0.9799292 | 0.969697 | 0.9872979 | 0.9918478 |
| f1 | 0.9764566 | 0.0065211 | 0.9704142 | 0.9851632 | 0.969697 | 0.9771429 | 0.9798658 |
| f2 | 0.9704682 | 0.0122690 | 0.9568262 | 0.9904535 | 0.969697 | 0.9671946 | 0.9681698 |
| lift_top_group | 3.5199234 | 0.1771113 | 3.3872833 | 3.6467066 | 3.5939393 | 3.2808988 | 3.6907895 |
| loglikelihood | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| --- | --- | --- | --- | --- | --- | --- | --- |
| mean_per_class_error | 0.0192730 | 0.0071964 | 0.0272222 | 0.0075189 | 0.0209926 | 0.0208944 | 0.0197368 |
| mse | 0.0117021 | 0.0034641 | 0.0139158 | 0.0070053 | 0.0160867 | 0.0108037 | 0.0106989 |
| null_deviance | 701.06104 | 25.972713 | 711.6207 | 715.99097 | 701.4596 | 719.9654 | 656.2684 |
| pr_auc | 0.9962468 | 0.0011169 | 0.9957380 | 0.9963841 | 0.9948334 | 0.9963907 | 0.997888 |
| precision | 0.9868586 | 0.0130284 | 0.9939394 | 0.9764706 | 0.969697 | 0.9941860 | 1.0 |
| r2 | 0.9425337 | 0.0169548 | 0.9331183 | 0.9648018 | 0.9198968 | 0.9490137 | 0.9458376 |
| recall | 0.9665772 | 0.0171743 | 0.9479769 | 0.994012 | 0.969697 | 0.9606742 | 0.9605263 |
| residual_deviance | 49.49375 | 11.132932 | 56.208588 | 38.05992 | 64.7434 | 48.225174 | 40.231678 |
| rmse | 0.1071745 | 0.0164205 | 0.1179653 | 0.0836975 | 0.1268335 | 0.1039410 | 0.1034354 |
| specificity | 0.9948767 | 0.0049765 | 0.9975787 | 0.9909502 | 0.9883177 | 0.9975370 | 1.0 |
[24 rows x 8 columns]
[tips] Use `model.explain()` to inspect the model. -- Use `h2o.display.toggle_user_tips()` to switch on/off this section.
In [64]:
import h2o
h2o.init(ip="localhost", port=54321, start_h2o=False)
from h2o.estimators import H2OXGBoostEstimator
xgb = H2OXGBoostEstimator(ntrees=100, max_depth=5, learn_rate=0.1, seed=42)
xgb.train(x=x, y=y, training_frame=train)
Warning: if you don't want to start local H2O server, then use of `h2o.connect()` is preferred. Checking whether there is an H2O instance running at http://localhost:54321. connected. Warning: Your H2O cluster version is (4 months and 21 days) old. There may be a newer version available. Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
| H2O_cluster_uptime: | 11 hours 55 mins |
| H2O_cluster_timezone: | Europe/Moscow |
| H2O_data_parsing_timezone: | UTC |
| H2O_cluster_version: | 3.46.0.6 |
| H2O_cluster_version_age: | 4 months and 21 days |
| H2O_cluster_name: | H2O_from_python_25150_lkcgsm |
| H2O_cluster_total_nodes: | 1 |
| H2O_cluster_free_memory: | 7.278 Gb |
| H2O_cluster_total_cores: | 32 |
| H2O_cluster_allowed_cores: | 32 |
| H2O_cluster_status: | locked, healthy |
| H2O_connection_url: | http://localhost:54321 |
| H2O_connection_proxy: | {"http": null, "https": null} |
| H2O_internal_security: | False |
| Python_version: | 3.11.5 final |
--------------------------------------------------------------------------- H2OResponseError Traceback (most recent call last) Cell In[64], line 6 3 from h2o.estimators import H2OXGBoostEstimator 5 xgb = H2OXGBoostEstimator(ntrees=100, max_depth=5, learn_rate=0.1, seed=42) ----> 6 xgb.train(x=x, y=y, training_frame=train) File D:\anaconda\Lib\site-packages\h2o\estimators\estimator_base.py:107, in H2OEstimator.train(self, x, y, training_frame, offset_column, fold_column, weights_column, validation_frame, max_runtime_secs, ignored_columns, model_id, verbose) 88 """ 89 Train the H2O model. 90 (...) 101 :param bool verbose: Print scoring history to stdout. Defaults to False. 102 """ 103 parms = self._make_parms(x=x, y=y, training_frame=training_frame, offset_column=offset_column, 104 fold_column=fold_column, weights_column=weights_column, 105 validation_frame=validation_frame, max_runtime_secs=max_runtime_secs, 106 ignored_columns=ignored_columns, model_id=model_id, verbose=verbose) --> 107 self._train(parms, verbose=verbose) 108 return self File D:\anaconda\Lib\site-packages\h2o\estimators\estimator_base.py:186, in H2OEstimator._train(self, parms, verbose) 183 assert_is_type(verbose, bool) 185 rest_ver = self._get_rest_version(parms) --> 186 model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms) 187 job = H2OJob(model_builder_json, job_type=(self.algo + " Model Build")) 189 if model_builder_json["messages"] is not None: File D:\anaconda\Lib\site-packages\h2o\h2o.py:123, in api(endpoint, data, json, filename, save_to) 121 # type checks are performed in H2OConnection class 122 _check_connection() --> 123 return h2oconn.request(endpoint, data=data, json=json, filename=filename, save_to=save_to) File D:\anaconda\Lib\site-packages\h2o\backend\connection.py:499, in H2OConnection.request(self, endpoint, data, json, filename, save_to) 497 save_to = save_to(resp) 498 self._log_end_transaction(start_time, resp) --> 499 return self._process_response(resp, save_to) 501 except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as e: 502 if self._local_server and not self._local_server.is_running(): File D:\anaconda\Lib\site-packages\h2o\backend\connection.py:853, in H2OConnection._process_response(response, save_to) 851 if status_code in {400, 404, 412} and isinstance(data, H2OErrorV3): 852 data.show_stacktrace = False --> 853 raise H2OResponseError(data) 855 # Server errors (notably 500 = "Server Error") 856 # Note that it is possible to receive valid H2OErrorV3 object in this case, however it merely means the server 857 # did not provide the correct status code. 858 raise H2OServerError("HTTP %d %s:\n%s" % (status_code, response.reason, data)) H2OResponseError: Server error water.exceptions.H2ONotFoundArgumentException: Error: POST /3/ModelBuilders/xgboost not found Request: POST /3/ModelBuilders/xgboost data: {'training_frame': 'py_10_sid_baac', 'nfolds': '0', 'keep_cross_validation_models': 'True', 'keep_cross_validation_predictions': 'False', 'keep_cross_validation_fold_assignment': 'False', 'score_each_iteration': 'False', 'fold_assignment': 'auto', 'response_column': 'Healthy_Status', 'ignore_const_cols': 'True', 'stopping_rounds': '0', 'stopping_metric': 'auto', 'stopping_tolerance': '0.001', 'max_runtime_secs': '0.0', 'seed': '42', 'distribution': 'auto', 'tweedie_power': '1.5', 'categorical_encoding': 'auto', 'quiet_mode': 'True', 'ntrees': '100', 'max_depth': '5', 'min_rows': '1.0', 'min_child_weight': '1.0', 'learn_rate': '0.1', 'eta': '0.3', 'sample_rate': '1.0', 'subsample': '1.0', 'col_sample_rate': '1.0', 'colsample_bylevel': '1.0', 'col_sample_rate_per_tree': '1.0', 'colsample_bytree': '1.0', 'colsample_bynode': '1.0', 'max_abs_leafnode_pred': '0.0', 'max_delta_step': '0.0', 'score_tree_interval': '0', 'min_split_improvement': '0.0', 'gamma': '0.0', 'nthread': '-1', 'build_tree_one_node': 'False', 'parallelize_cross_validation': 'True', 'calibrate_model': 'False', 'calibration_method': 'auto', 'max_bins': '256', 'max_leaves': '0', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': '0.0', 'one_drop': 'False', 'skip_drop': '0.0', 'tree_method': 'auto', 'grow_policy': 'depthwise', 'booster': 'gbtree', 'reg_lambda': '1.0', 'reg_alpha': '0.0', 'dmatrix_type': 'auto', 'backend': 'auto', 'gainslift_bins': '-1', 'auc_type': 'auto', 'scale_pos_weight': '1.0', 'score_eval_metric_only': 'False'}
In [65]:
cd path\to\your\h2o\folder
java -Xmx4g -jar h2o.jar
Cell In[65], line 1 cd path\to\your\h2o\folder ^ SyntaxError: unexpected character after line continuation character
In [66]:
import h2o
h2o.init(ip="localhost", port=54323)
Checking whether there is an H2O instance running at http://localhost:54323. connected. Warning: Your H2O cluster version is (4 months and 21 days) old. There may be a newer version available. Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
| H2O_cluster_uptime: | 51 mins 21 secs |
| H2O_cluster_timezone: | Europe/Moscow |
| H2O_data_parsing_timezone: | UTC |
| H2O_cluster_version: | 3.46.0.6 |
| H2O_cluster_version_age: | 4 months and 21 days |
| H2O_cluster_name: | 25150 |
| H2O_cluster_total_nodes: | 1 |
| H2O_cluster_free_memory: | 4 Gb |
| H2O_cluster_total_cores: | 32 |
| H2O_cluster_allowed_cores: | 32 |
| H2O_cluster_status: | locked, healthy |
| H2O_connection_url: | http://localhost:54323 |
| H2O_connection_proxy: | {"http": null, "https": null} |
| H2O_internal_security: | False |
| Python_version: | 3.11.5 final |
In [ ]: